/home/arjun/llvm-project/llvm/lib/Support/regcomp.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*- |
2 | | * This code is derived from OpenBSD's libc/regex, original license follows: |
3 | | * |
4 | | * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
5 | | * Copyright (c) 1992, 1993, 1994 |
6 | | * The Regents of the University of California. All rights reserved. |
7 | | * |
8 | | * This code is derived from software contributed to Berkeley by |
9 | | * Henry Spencer. |
10 | | * |
11 | | * Redistribution and use in source and binary forms, with or without |
12 | | * modification, are permitted provided that the following conditions |
13 | | * are met: |
14 | | * 1. Redistributions of source code must retain the above copyright |
15 | | * notice, this list of conditions and the following disclaimer. |
16 | | * 2. Redistributions in binary form must reproduce the above copyright |
17 | | * notice, this list of conditions and the following disclaimer in the |
18 | | * documentation and/or other materials provided with the distribution. |
19 | | * 3. Neither the name of the University nor the names of its contributors |
20 | | * may be used to endorse or promote products derived from this software |
21 | | * without specific prior written permission. |
22 | | * |
23 | | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
24 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
25 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
26 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
27 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
28 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
29 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
30 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
31 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
32 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
33 | | * SUCH DAMAGE. |
34 | | * |
35 | | * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
36 | | */ |
37 | | |
38 | | #include <sys/types.h> |
39 | | #include <stdint.h> |
40 | | #include <stdio.h> |
41 | | #include <string.h> |
42 | | #include <ctype.h> |
43 | | #include <limits.h> |
44 | | #include <stdlib.h> |
45 | | #include "regex_impl.h" |
46 | | |
47 | | #include "regutils.h" |
48 | | #include "regex2.h" |
49 | | |
50 | | #include "llvm/Config/config.h" |
51 | | #include "llvm/Support/Compiler.h" |
52 | | |
53 | | /* character-class table */ |
54 | | static struct cclass { |
55 | | const char *name; |
56 | | const char *chars; |
57 | | const char *multis; |
58 | | } cclasses[] = { |
59 | | { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
60 | | 0123456789", ""} , |
61 | | { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", |
62 | | ""} , |
63 | | { "blank", " \t", ""} , |
64 | | { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ |
65 | | \25\26\27\30\31\32\33\34\35\36\37\177", ""} , |
66 | | { "digit", "0123456789", ""} , |
67 | | { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
68 | | 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", |
69 | | ""} , |
70 | | { "lower", "abcdefghijklmnopqrstuvwxyz", |
71 | | ""} , |
72 | | { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
73 | | 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", |
74 | | ""} , |
75 | | { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", |
76 | | ""} , |
77 | | { "space", "\t\n\v\f\r ", ""} , |
78 | | { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", |
79 | | ""} , |
80 | | { "xdigit", "0123456789ABCDEFabcdef", |
81 | | ""} , |
82 | | { NULL, 0, "" } |
83 | | }; |
84 | | |
85 | | /* character-name table */ |
86 | | static struct cname { |
87 | | const char *name; |
88 | | char code; |
89 | | } cnames[] = { |
90 | | { "NUL", '\0' }, |
91 | | { "SOH", '\001' }, |
92 | | { "STX", '\002' }, |
93 | | { "ETX", '\003' }, |
94 | | { "EOT", '\004' }, |
95 | | { "ENQ", '\005' }, |
96 | | { "ACK", '\006' }, |
97 | | { "BEL", '\007' }, |
98 | | { "alert", '\007' }, |
99 | | { "BS", '\010' }, |
100 | | { "backspace", '\b' }, |
101 | | { "HT", '\011' }, |
102 | | { "tab", '\t' }, |
103 | | { "LF", '\012' }, |
104 | | { "newline", '\n' }, |
105 | | { "VT", '\013' }, |
106 | | { "vertical-tab", '\v' }, |
107 | | { "FF", '\014' }, |
108 | | { "form-feed", '\f' }, |
109 | | { "CR", '\015' }, |
110 | | { "carriage-return", '\r' }, |
111 | | { "SO", '\016' }, |
112 | | { "SI", '\017' }, |
113 | | { "DLE", '\020' }, |
114 | | { "DC1", '\021' }, |
115 | | { "DC2", '\022' }, |
116 | | { "DC3", '\023' }, |
117 | | { "DC4", '\024' }, |
118 | | { "NAK", '\025' }, |
119 | | { "SYN", '\026' }, |
120 | | { "ETB", '\027' }, |
121 | | { "CAN", '\030' }, |
122 | | { "EM", '\031' }, |
123 | | { "SUB", '\032' }, |
124 | | { "ESC", '\033' }, |
125 | | { "IS4", '\034' }, |
126 | | { "FS", '\034' }, |
127 | | { "IS3", '\035' }, |
128 | | { "GS", '\035' }, |
129 | | { "IS2", '\036' }, |
130 | | { "RS", '\036' }, |
131 | | { "IS1", '\037' }, |
132 | | { "US", '\037' }, |
133 | | { "space", ' ' }, |
134 | | { "exclamation-mark", '!' }, |
135 | | { "quotation-mark", '"' }, |
136 | | { "number-sign", '#' }, |
137 | | { "dollar-sign", '$' }, |
138 | | { "percent-sign", '%' }, |
139 | | { "ampersand", '&' }, |
140 | | { "apostrophe", '\'' }, |
141 | | { "left-parenthesis", '(' }, |
142 | | { "right-parenthesis", ')' }, |
143 | | { "asterisk", '*' }, |
144 | | { "plus-sign", '+' }, |
145 | | { "comma", ',' }, |
146 | | { "hyphen", '-' }, |
147 | | { "hyphen-minus", '-' }, |
148 | | { "period", '.' }, |
149 | | { "full-stop", '.' }, |
150 | | { "slash", '/' }, |
151 | | { "solidus", '/' }, |
152 | | { "zero", '0' }, |
153 | | { "one", '1' }, |
154 | | { "two", '2' }, |
155 | | { "three", '3' }, |
156 | | { "four", '4' }, |
157 | | { "five", '5' }, |
158 | | { "six", '6' }, |
159 | | { "seven", '7' }, |
160 | | { "eight", '8' }, |
161 | | { "nine", '9' }, |
162 | | { "colon", ':' }, |
163 | | { "semicolon", ';' }, |
164 | | { "less-than-sign", '<' }, |
165 | | { "equals-sign", '=' }, |
166 | | { "greater-than-sign", '>' }, |
167 | | { "question-mark", '?' }, |
168 | | { "commercial-at", '@' }, |
169 | | { "left-square-bracket", '[' }, |
170 | | { "backslash", '\\' }, |
171 | | { "reverse-solidus", '\\' }, |
172 | | { "right-square-bracket", ']' }, |
173 | | { "circumflex", '^' }, |
174 | | { "circumflex-accent", '^' }, |
175 | | { "underscore", '_' }, |
176 | | { "low-line", '_' }, |
177 | | { "grave-accent", '`' }, |
178 | | { "left-brace", '{' }, |
179 | | { "left-curly-bracket", '{' }, |
180 | | { "vertical-line", '|' }, |
181 | | { "right-brace", '}' }, |
182 | | { "right-curly-bracket", '}' }, |
183 | | { "tilde", '~' }, |
184 | | { "DEL", '\177' }, |
185 | | { NULL, 0 } |
186 | | }; |
187 | | |
188 | | /* |
189 | | * parse structure, passed up and down to avoid global variables and |
190 | | * other clumsinesses |
191 | | */ |
192 | | struct parse { |
193 | | char *next; /* next character in RE */ |
194 | | char *end; /* end of string (-> NUL normally) */ |
195 | | int error; /* has an error been seen? */ |
196 | | sop *strip; /* malloced strip */ |
197 | | sopno ssize; /* malloced strip size (allocated) */ |
198 | | sopno slen; /* malloced strip length (used) */ |
199 | | int ncsalloc; /* number of csets allocated */ |
200 | | struct re_guts *g; |
201 | 0 | # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
202 | | sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
203 | | sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
204 | | }; |
205 | | |
206 | | static void p_ere(struct parse *, int); |
207 | | static void p_ere_exp(struct parse *); |
208 | | static void p_str(struct parse *); |
209 | | static void p_bre(struct parse *, int, int); |
210 | | static int p_simp_re(struct parse *, int); |
211 | | static int p_count(struct parse *); |
212 | | static void p_bracket(struct parse *); |
213 | | static void p_b_term(struct parse *, cset *); |
214 | | static void p_b_cclass(struct parse *, cset *); |
215 | | static void p_b_eclass(struct parse *, cset *); |
216 | | static char p_b_symbol(struct parse *); |
217 | | static char p_b_coll_elem(struct parse *, int); |
218 | | static char othercase(int); |
219 | | static void bothcases(struct parse *, int); |
220 | | static void ordinary(struct parse *, int); |
221 | | static void nonnewline(struct parse *); |
222 | | static void repeat(struct parse *, sopno, int, int); |
223 | | static int seterr(struct parse *, int); |
224 | | static cset *allocset(struct parse *); |
225 | | static void freeset(struct parse *, cset *); |
226 | | static int freezeset(struct parse *, cset *); |
227 | | static int firstch(struct parse *, cset *); |
228 | | static int nch(struct parse *, cset *); |
229 | | static void mcadd(struct parse *, cset *, const char *); |
230 | | static void mcinvert(struct parse *, cset *); |
231 | | static void mccase(struct parse *, cset *); |
232 | | static int isinsets(struct re_guts *, int); |
233 | | static int samesets(struct re_guts *, int, int); |
234 | | static void categorize(struct parse *, struct re_guts *); |
235 | | static sopno dupl(struct parse *, sopno, sopno); |
236 | | static void doemit(struct parse *, sop, size_t); |
237 | | static void doinsert(struct parse *, sop, size_t, sopno); |
238 | | static void dofwd(struct parse *, sopno, sop); |
239 | | static void enlarge(struct parse *, sopno); |
240 | | static void stripsnug(struct parse *, struct re_guts *); |
241 | | static void findmust(struct parse *, struct re_guts *); |
242 | | static sopno pluscount(struct parse *, struct re_guts *); |
243 | | |
244 | | static char nuls[10]; /* place to point scanner in event of error */ |
245 | | |
246 | | /* |
247 | | * macros for use with parse structure |
248 | | * BEWARE: these know that the parse structure is named `p' !!! |
249 | | */ |
250 | 0 | #define PEEK() (*p->next) |
251 | 0 | #define PEEK2() (*(p->next+1)) |
252 | 0 | #define MORE() (p->next < p->end) |
253 | 0 | #define MORE2() (p->next+1 < p->end) |
254 | 0 | #define SEE(c) (MORE() && PEEK() == (c)) |
255 | 0 | #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) |
256 | 0 | #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) |
257 | 0 | #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) |
258 | 0 | #define NEXT() (p->next++) |
259 | 0 | #define NEXT2() (p->next += 2) |
260 | 0 | #define NEXTn(n) (p->next += (n)) |
261 | 0 | #define GETNEXT() (*p->next++) |
262 | 0 | #define SETERROR(e) seterr(p, (e)) |
263 | 0 | #define REQUIRE(co, e) (void)((co) || SETERROR(e)) |
264 | | #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) |
265 | 0 | #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) |
266 | | #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) |
267 | 0 | #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) |
268 | 0 | #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) |
269 | 0 | #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) |
270 | 0 | #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) |
271 | 0 | #define HERE() (p->slen) |
272 | 0 | #define THERE() (p->slen - 1) |
273 | | #define THERETHERE() (p->slen - 2) |
274 | 0 | #define DROP(n) (p->slen -= (n)) |
275 | | |
276 | | #ifdef _POSIX2_RE_DUP_MAX |
277 | 0 | #define DUPMAX _POSIX2_RE_DUP_MAX |
278 | | #else |
279 | | #define DUPMAX 255 |
280 | | #endif |
281 | 0 | #define INFINITY (DUPMAX + 1) |
282 | | |
283 | | #ifndef NDEBUG |
284 | | static int never = 0; /* for use in asserts; shuts lint up */ |
285 | | #else |
286 | | #define never 0 /* some <assert.h>s have bugs too */ |
287 | | #endif |
288 | | |
289 | | /* |
290 | | - llvm_regcomp - interface for parser and compilation |
291 | | */ |
292 | | int /* 0 success, otherwise REG_something */ |
293 | | llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) |
294 | 0 | { |
295 | 0 | struct parse pa; |
296 | 0 | struct re_guts *g; |
297 | 0 | struct parse *p = &pa; |
298 | 0 | int i; |
299 | 0 | size_t len; |
300 | | #ifdef REDEBUG |
301 | | # define GOODFLAGS(f) (f) |
302 | | #else |
303 | 0 | # define GOODFLAGS(f) ((f)&~REG_DUMP) |
304 | 0 | #endif |
305 | 0 |
|
306 | 0 | cflags = GOODFLAGS(cflags); |
307 | 0 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
308 | 0 | return(REG_INVARG); |
309 | 0 |
|
310 | 0 | if (cflags®_PEND) { |
311 | 0 | if (preg->re_endp < pattern) |
312 | 0 | return(REG_INVARG); |
313 | 0 | len = preg->re_endp - pattern; |
314 | 0 | } else |
315 | 0 | len = strlen((const char *)pattern); |
316 | 0 |
|
317 | 0 | /* do the mallocs early so failure handling is easy */ |
318 | 0 | g = (struct re_guts *)malloc(sizeof(struct re_guts) + |
319 | 0 | (NC-1)*sizeof(cat_t)); |
320 | 0 | if (g == NULL) |
321 | 0 | return(REG_ESPACE); |
322 | 0 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
323 | 0 | p->strip = (sop *)calloc(p->ssize, sizeof(sop)); |
324 | 0 | p->slen = 0; |
325 | 0 | if (p->strip == NULL) { |
326 | 0 | free((char *)g); |
327 | 0 | return(REG_ESPACE); |
328 | 0 | } |
329 | 0 |
|
330 | 0 | /* set things up */ |
331 | 0 | p->g = g; |
332 | 0 | p->next = (char *)pattern; /* convenience; we do not modify it */ |
333 | 0 | p->end = p->next + len; |
334 | 0 | p->error = 0; |
335 | 0 | p->ncsalloc = 0; |
336 | 0 | for (i = 0; i < NPAREN; i++) { |
337 | 0 | p->pbegin[i] = 0; |
338 | 0 | p->pend[i] = 0; |
339 | 0 | } |
340 | 0 | g->csetsize = NC; |
341 | 0 | g->sets = NULL; |
342 | 0 | g->setbits = NULL; |
343 | 0 | g->ncsets = 0; |
344 | 0 | g->cflags = cflags; |
345 | 0 | g->iflags = 0; |
346 | 0 | g->nbol = 0; |
347 | 0 | g->neol = 0; |
348 | 0 | g->must = NULL; |
349 | 0 | g->mlen = 0; |
350 | 0 | g->nsub = 0; |
351 | 0 | g->ncategories = 1; /* category 0 is "everything else" */ |
352 | 0 | g->categories = &g->catspace[-(CHAR_MIN)]; |
353 | 0 | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); |
354 | 0 | g->backrefs = 0; |
355 | 0 |
|
356 | 0 | /* do it */ |
357 | 0 | EMIT(OEND, 0); |
358 | 0 | g->firststate = THERE(); |
359 | 0 | if (cflags®_EXTENDED) |
360 | 0 | p_ere(p, OUT); |
361 | 0 | else if (cflags®_NOSPEC) |
362 | 0 | p_str(p); |
363 | 0 | else |
364 | 0 | p_bre(p, OUT, OUT); |
365 | 0 | EMIT(OEND, 0); |
366 | 0 | g->laststate = THERE(); |
367 | 0 |
|
368 | 0 | /* tidy up loose ends and fill things in */ |
369 | 0 | categorize(p, g); |
370 | 0 | stripsnug(p, g); |
371 | 0 | findmust(p, g); |
372 | 0 | g->nplus = pluscount(p, g); |
373 | 0 | g->magic = MAGIC2; |
374 | 0 | preg->re_nsub = g->nsub; |
375 | 0 | preg->re_g = g; |
376 | 0 | preg->re_magic = MAGIC1; |
377 | 0 | #ifndef REDEBUG |
378 | 0 | /* not debugging, so can't rely on the assert() in llvm_regexec() */ |
379 | 0 | if (g->iflags®EX_BAD) |
380 | 0 | SETERROR(REG_ASSERT); |
381 | 0 | #endif |
382 | 0 |
|
383 | 0 | /* win or lose, we're done */ |
384 | 0 | if (p->error != 0) /* lose */ |
385 | 0 | llvm_regfree(preg); |
386 | 0 | return(p->error); |
387 | 0 | } |
388 | | |
389 | | /* |
390 | | - p_ere - ERE parser top level, concatenation and alternation |
391 | | */ |
392 | | static void |
393 | | p_ere(struct parse *p, int stop) /* character this ERE should end at */ |
394 | 0 | { |
395 | 0 | char c; |
396 | 0 | sopno prevback = 0; |
397 | 0 | sopno prevfwd = 0; |
398 | 0 | sopno conc; |
399 | 0 | int first = 1; /* is this the first alternative? */ |
400 | 0 |
|
401 | 0 | for (;;) { |
402 | 0 | /* do a bunch of concatenated expressions */ |
403 | 0 | conc = HERE(); |
404 | 0 | while (MORE() && (c = PEEK()) != '|' && c != stop) |
405 | 0 | p_ere_exp(p); |
406 | 0 | REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
407 | 0 |
|
408 | 0 | if (!EAT('|')) |
409 | 0 | break; /* NOTE BREAK OUT */ |
410 | 0 | |
411 | 0 | if (first) { |
412 | 0 | INSERT(OCH_, conc); /* offset is wrong */ |
413 | 0 | prevfwd = conc; |
414 | 0 | prevback = conc; |
415 | 0 | first = 0; |
416 | 0 | } |
417 | 0 | ASTERN(OOR1, prevback); |
418 | 0 | prevback = THERE(); |
419 | 0 | AHEAD(prevfwd); /* fix previous offset */ |
420 | 0 | prevfwd = HERE(); |
421 | 0 | EMIT(OOR2, 0); /* offset is very wrong */ |
422 | 0 | } |
423 | 0 |
|
424 | 0 | if (!first) { /* tail-end fixups */ |
425 | 0 | AHEAD(prevfwd); |
426 | 0 | ASTERN(O_CH, prevback); |
427 | 0 | } |
428 | 0 |
|
429 | 0 | assert(!MORE() || SEE(stop)); |
430 | 0 | } |
431 | | |
432 | | /* |
433 | | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
434 | | */ |
435 | | static void |
436 | | p_ere_exp(struct parse *p) |
437 | 0 | { |
438 | 0 | char c; |
439 | 0 | sopno pos; |
440 | 0 | int count; |
441 | 0 | int count2; |
442 | 0 | int backrefnum; |
443 | 0 | sopno subno; |
444 | 0 | int wascaret = 0; |
445 | 0 |
|
446 | 0 | assert(MORE()); /* caller should have ensured this */ |
447 | 0 | c = GETNEXT(); |
448 | 0 |
|
449 | 0 | pos = HERE(); |
450 | 0 | switch (c) { |
451 | 0 | case '(': |
452 | 0 | REQUIRE(MORE(), REG_EPAREN); |
453 | 0 | p->g->nsub++; |
454 | 0 | subno = p->g->nsub; |
455 | 0 | if (subno < NPAREN) |
456 | 0 | p->pbegin[subno] = HERE(); |
457 | 0 | EMIT(OLPAREN, subno); |
458 | 0 | if (!SEE(')')) |
459 | 0 | p_ere(p, ')'); |
460 | 0 | if (subno < NPAREN) { |
461 | 0 | p->pend[subno] = HERE(); |
462 | 0 | assert(p->pend[subno] != 0); |
463 | 0 | } |
464 | 0 | EMIT(ORPAREN, subno); |
465 | 0 | MUSTEAT(')', REG_EPAREN); |
466 | 0 | break; |
467 | 0 | #ifndef POSIX_MISTAKE |
468 | 0 | case ')': /* happens only if no current unmatched ( */ |
469 | 0 | /* |
470 | 0 | * You may ask, why the ifndef? Because I didn't notice |
471 | 0 | * this until slightly too late for 1003.2, and none of the |
472 | 0 | * other 1003.2 regular-expression reviewers noticed it at |
473 | 0 | * all. So an unmatched ) is legal POSIX, at least until |
474 | 0 | * we can get it fixed. |
475 | 0 | */ |
476 | 0 | SETERROR(REG_EPAREN); |
477 | 0 | break; |
478 | 0 | #endif |
479 | 0 | case '^': |
480 | 0 | EMIT(OBOL, 0); |
481 | 0 | p->g->iflags |= USEBOL; |
482 | 0 | p->g->nbol++; |
483 | 0 | wascaret = 1; |
484 | 0 | break; |
485 | 0 | case '$': |
486 | 0 | EMIT(OEOL, 0); |
487 | 0 | p->g->iflags |= USEEOL; |
488 | 0 | p->g->neol++; |
489 | 0 | break; |
490 | 0 | case '|': |
491 | 0 | SETERROR(REG_EMPTY); |
492 | 0 | break; |
493 | 0 | case '*': |
494 | 0 | case '+': |
495 | 0 | case '?': |
496 | 0 | SETERROR(REG_BADRPT); |
497 | 0 | break; |
498 | 0 | case '.': |
499 | 0 | if (p->g->cflags®_NEWLINE) |
500 | 0 | nonnewline(p); |
501 | 0 | else |
502 | 0 | EMIT(OANY, 0); |
503 | 0 | break; |
504 | 0 | case '[': |
505 | 0 | p_bracket(p); |
506 | 0 | break; |
507 | 0 | case '\\': |
508 | 0 | REQUIRE(MORE(), REG_EESCAPE); |
509 | 0 | c = GETNEXT(); |
510 | 0 | if (c >= '1' && c <= '9') { |
511 | 0 | /* \[0-9] is taken to be a back-reference to a previously specified |
512 | 0 | * matching group. backrefnum will hold the number. The matching |
513 | 0 | * group must exist (i.e. if \4 is found there must have been at |
514 | 0 | * least 4 matching groups specified in the pattern previously). |
515 | 0 | */ |
516 | 0 | backrefnum = c - '0'; |
517 | 0 | if (p->pend[backrefnum] == 0) { |
518 | 0 | SETERROR(REG_ESUBREG); |
519 | 0 | break; |
520 | 0 | } |
521 | 0 |
|
522 | 0 | /* Make sure everything checks out and emit the sequence |
523 | 0 | * that marks a back-reference to the parse structure. |
524 | 0 | */ |
525 | 0 | assert(backrefnum <= p->g->nsub); |
526 | 0 | EMIT(OBACK_, backrefnum); |
527 | 0 | assert(p->pbegin[backrefnum] != 0); |
528 | 0 | assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN); |
529 | 0 | assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN); |
530 | 0 | (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); |
531 | 0 | EMIT(O_BACK, backrefnum); |
532 | 0 | p->g->backrefs = 1; |
533 | 0 | } else { |
534 | 0 | /* Other chars are simply themselves when escaped with a backslash. |
535 | 0 | */ |
536 | 0 | ordinary(p, c); |
537 | 0 | } |
538 | 0 | break; |
539 | 0 | case '{': /* okay as ordinary except if digit follows */ |
540 | 0 | REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
541 | 0 | LLVM_FALLTHROUGH; |
542 | 0 | default: |
543 | 0 | ordinary(p, c); |
544 | 0 | break; |
545 | 0 | } |
546 | 0 | |
547 | 0 | if (!MORE()) |
548 | 0 | return; |
549 | 0 | c = PEEK(); |
550 | 0 | /* we call { a repetition if followed by a digit */ |
551 | 0 | if (!( c == '*' || c == '+' || c == '?' || |
552 | 0 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
553 | 0 | return; /* no repetition, we're done */ |
554 | 0 | NEXT(); |
555 | 0 |
|
556 | 0 | REQUIRE(!wascaret, REG_BADRPT); |
557 | 0 | switch (c) { |
558 | 0 | case '*': /* implemented as +? */ |
559 | 0 | /* this case does not require the (y|) trick, noKLUDGE */ |
560 | 0 | INSERT(OPLUS_, pos); |
561 | 0 | ASTERN(O_PLUS, pos); |
562 | 0 | INSERT(OQUEST_, pos); |
563 | 0 | ASTERN(O_QUEST, pos); |
564 | 0 | break; |
565 | 0 | case '+': |
566 | 0 | INSERT(OPLUS_, pos); |
567 | 0 | ASTERN(O_PLUS, pos); |
568 | 0 | break; |
569 | 0 | case '?': |
570 | 0 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
571 | 0 | INSERT(OCH_, pos); /* offset slightly wrong */ |
572 | 0 | ASTERN(OOR1, pos); /* this one's right */ |
573 | 0 | AHEAD(pos); /* fix the OCH_ */ |
574 | 0 | EMIT(OOR2, 0); /* offset very wrong... */ |
575 | 0 | AHEAD(THERE()); /* ...so fix it */ |
576 | 0 | ASTERN(O_CH, THERETHERE()); |
577 | 0 | break; |
578 | 0 | case '{': |
579 | 0 | count = p_count(p); |
580 | 0 | if (EAT(',')) { |
581 | 0 | if (isdigit((uch)PEEK())) { |
582 | 0 | count2 = p_count(p); |
583 | 0 | REQUIRE(count <= count2, REG_BADBR); |
584 | 0 | } else /* single number with comma */ |
585 | 0 | count2 = INFINITY; |
586 | 0 | } else /* just a single number */ |
587 | 0 | count2 = count; |
588 | 0 | repeat(p, pos, count, count2); |
589 | 0 | if (!EAT('}')) { /* error heuristics */ |
590 | 0 | while (MORE() && PEEK() != '}') |
591 | 0 | NEXT(); |
592 | 0 | REQUIRE(MORE(), REG_EBRACE); |
593 | 0 | SETERROR(REG_BADBR); |
594 | 0 | } |
595 | 0 | break; |
596 | 0 | } |
597 | 0 |
|
598 | 0 | if (!MORE()) |
599 | 0 | return; |
600 | 0 | c = PEEK(); |
601 | 0 | if (!( c == '*' || c == '+' || c == '?' || |
602 | 0 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
603 | 0 | return; |
604 | 0 | SETERROR(REG_BADRPT); |
605 | 0 | } |
606 | | |
607 | | /* |
608 | | - p_str - string (no metacharacters) "parser" |
609 | | */ |
610 | | static void |
611 | | p_str(struct parse *p) |
612 | 0 | { |
613 | 0 | REQUIRE(MORE(), REG_EMPTY); |
614 | 0 | while (MORE()) |
615 | 0 | ordinary(p, GETNEXT()); |
616 | 0 | } |
617 | | |
618 | | /* |
619 | | - p_bre - BRE parser top level, anchoring and concatenation |
620 | | * Giving end1 as OUT essentially eliminates the end1/end2 check. |
621 | | * |
622 | | * This implementation is a bit of a kludge, in that a trailing $ is first |
623 | | * taken as an ordinary character and then revised to be an anchor. The |
624 | | * only undesirable side effect is that '$' gets included as a character |
625 | | * category in such cases. This is fairly harmless; not worth fixing. |
626 | | * The amount of lookahead needed to avoid this kludge is excessive. |
627 | | */ |
628 | | static void |
629 | | p_bre(struct parse *p, |
630 | | int end1, /* first terminating character */ |
631 | | int end2) /* second terminating character */ |
632 | 0 | { |
633 | 0 | sopno start = HERE(); |
634 | 0 | int first = 1; /* first subexpression? */ |
635 | 0 | int wasdollar = 0; |
636 | 0 |
|
637 | 0 | if (EAT('^')) { |
638 | 0 | EMIT(OBOL, 0); |
639 | 0 | p->g->iflags |= USEBOL; |
640 | 0 | p->g->nbol++; |
641 | 0 | } |
642 | 0 | while (MORE() && !SEETWO(end1, end2)) { |
643 | 0 | wasdollar = p_simp_re(p, first); |
644 | 0 | first = 0; |
645 | 0 | } |
646 | 0 | if (wasdollar) { /* oops, that was a trailing anchor */ |
647 | 0 | DROP(1); |
648 | 0 | EMIT(OEOL, 0); |
649 | 0 | p->g->iflags |= USEEOL; |
650 | 0 | p->g->neol++; |
651 | 0 | } |
652 | 0 |
|
653 | 0 | REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
654 | 0 | } |
655 | | |
656 | | /* |
657 | | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
658 | | */ |
659 | | static int /* was the simple RE an unbackslashed $? */ |
660 | | p_simp_re(struct parse *p, |
661 | | int starordinary) /* is a leading * an ordinary character? */ |
662 | 0 | { |
663 | 0 | int c; |
664 | 0 | int count; |
665 | 0 | int count2; |
666 | 0 | sopno pos; |
667 | 0 | int i; |
668 | 0 | sopno subno; |
669 | 0 | # define BACKSL (1<<CHAR_BIT) |
670 | 0 |
|
671 | 0 | pos = HERE(); /* repetition op, if any, covers from here */ |
672 | 0 |
|
673 | 0 | assert(MORE()); /* caller should have ensured this */ |
674 | 0 | c = GETNEXT(); |
675 | 0 | if (c == '\\') { |
676 | 0 | REQUIRE(MORE(), REG_EESCAPE); |
677 | 0 | c = BACKSL | GETNEXT(); |
678 | 0 | } |
679 | 0 | switch (c) { |
680 | 0 | case '.': |
681 | 0 | if (p->g->cflags®_NEWLINE) |
682 | 0 | nonnewline(p); |
683 | 0 | else |
684 | 0 | EMIT(OANY, 0); |
685 | 0 | break; |
686 | 0 | case '[': |
687 | 0 | p_bracket(p); |
688 | 0 | break; |
689 | 0 | case BACKSL|'{': |
690 | 0 | SETERROR(REG_BADRPT); |
691 | 0 | break; |
692 | 0 | case BACKSL|'(': |
693 | 0 | p->g->nsub++; |
694 | 0 | subno = p->g->nsub; |
695 | 0 | if (subno < NPAREN) |
696 | 0 | p->pbegin[subno] = HERE(); |
697 | 0 | EMIT(OLPAREN, subno); |
698 | 0 | /* the MORE here is an error heuristic */ |
699 | 0 | if (MORE() && !SEETWO('\\', ')')) |
700 | 0 | p_bre(p, '\\', ')'); |
701 | 0 | if (subno < NPAREN) { |
702 | 0 | p->pend[subno] = HERE(); |
703 | 0 | assert(p->pend[subno] != 0); |
704 | 0 | } |
705 | 0 | EMIT(ORPAREN, subno); |
706 | 0 | REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
707 | 0 | break; |
708 | 0 | case BACKSL|')': /* should not get here -- must be user */ |
709 | 0 | case BACKSL|'}': |
710 | 0 | SETERROR(REG_EPAREN); |
711 | 0 | break; |
712 | 0 | case BACKSL|'1': |
713 | 0 | case BACKSL|'2': |
714 | 0 | case BACKSL|'3': |
715 | 0 | case BACKSL|'4': |
716 | 0 | case BACKSL|'5': |
717 | 0 | case BACKSL|'6': |
718 | 0 | case BACKSL|'7': |
719 | 0 | case BACKSL|'8': |
720 | 0 | case BACKSL|'9': |
721 | 0 | i = (c&~BACKSL) - '0'; |
722 | 0 | assert(i < NPAREN); |
723 | 0 | if (p->pend[i] != 0) { |
724 | 0 | assert(i <= p->g->nsub); |
725 | 0 | EMIT(OBACK_, i); |
726 | 0 | assert(p->pbegin[i] != 0); |
727 | 0 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
728 | 0 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
729 | 0 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
730 | 0 | EMIT(O_BACK, i); |
731 | 0 | } else |
732 | 0 | SETERROR(REG_ESUBREG); |
733 | 0 | p->g->backrefs = 1; |
734 | 0 | break; |
735 | 0 | case '*': |
736 | 0 | REQUIRE(starordinary, REG_BADRPT); |
737 | 0 | LLVM_FALLTHROUGH; |
738 | 0 | default: |
739 | 0 | ordinary(p, (char)c); |
740 | 0 | break; |
741 | 0 | } |
742 | 0 | |
743 | 0 | if (EAT('*')) { /* implemented as +? */ |
744 | 0 | /* this case does not require the (y|) trick, noKLUDGE */ |
745 | 0 | INSERT(OPLUS_, pos); |
746 | 0 | ASTERN(O_PLUS, pos); |
747 | 0 | INSERT(OQUEST_, pos); |
748 | 0 | ASTERN(O_QUEST, pos); |
749 | 0 | } else if (EATTWO('\\', '{')) { |
750 | 0 | count = p_count(p); |
751 | 0 | if (EAT(',')) { |
752 | 0 | if (MORE() && isdigit((uch)PEEK())) { |
753 | 0 | count2 = p_count(p); |
754 | 0 | REQUIRE(count <= count2, REG_BADBR); |
755 | 0 | } else /* single number with comma */ |
756 | 0 | count2 = INFINITY; |
757 | 0 | } else /* just a single number */ |
758 | 0 | count2 = count; |
759 | 0 | repeat(p, pos, count, count2); |
760 | 0 | if (!EATTWO('\\', '}')) { /* error heuristics */ |
761 | 0 | while (MORE() && !SEETWO('\\', '}')) |
762 | 0 | NEXT(); |
763 | 0 | REQUIRE(MORE(), REG_EBRACE); |
764 | 0 | SETERROR(REG_BADBR); |
765 | 0 | } |
766 | 0 | } else if (c == '$') /* $ (but not \$) ends it */ |
767 | 0 | return(1); |
768 | 0 | |
769 | 0 | return(0); |
770 | 0 | } |
771 | | |
772 | | /* |
773 | | - p_count - parse a repetition count |
774 | | */ |
775 | | static int /* the value */ |
776 | | p_count(struct parse *p) |
777 | 0 | { |
778 | 0 | int count = 0; |
779 | 0 | int ndigits = 0; |
780 | 0 |
|
781 | 0 | while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
782 | 0 | count = count*10 + (GETNEXT() - '0'); |
783 | 0 | ndigits++; |
784 | 0 | } |
785 | 0 |
|
786 | 0 | REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
787 | 0 | return(count); |
788 | 0 | } |
789 | | |
790 | | /* |
791 | | - p_bracket - parse a bracketed character list |
792 | | * |
793 | | * Note a significant property of this code: if the allocset() did SETERROR, |
794 | | * no set operations are done. |
795 | | */ |
796 | | static void |
797 | | p_bracket(struct parse *p) |
798 | 0 | { |
799 | 0 | cset *cs; |
800 | 0 | int invert = 0; |
801 | 0 |
|
802 | 0 | /* Dept of Truly Sickening Special-Case Kludges */ |
803 | 0 | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { |
804 | 0 | EMIT(OBOW, 0); |
805 | 0 | NEXTn(6); |
806 | 0 | return; |
807 | 0 | } |
808 | 0 | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { |
809 | 0 | EMIT(OEOW, 0); |
810 | 0 | NEXTn(6); |
811 | 0 | return; |
812 | 0 | } |
813 | 0 |
|
814 | 0 | if ((cs = allocset(p)) == NULL) { |
815 | 0 | /* allocset did set error status in p */ |
816 | 0 | return; |
817 | 0 | } |
818 | 0 | |
819 | 0 | if (EAT('^')) |
820 | 0 | invert++; /* make note to invert set at end */ |
821 | 0 | if (EAT(']')) |
822 | 0 | CHadd(cs, ']'); |
823 | 0 | else if (EAT('-')) |
824 | 0 | CHadd(cs, '-'); |
825 | 0 | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
826 | 0 | p_b_term(p, cs); |
827 | 0 | if (EAT('-')) |
828 | 0 | CHadd(cs, '-'); |
829 | 0 | MUSTEAT(']', REG_EBRACK); |
830 | 0 |
|
831 | 0 | if (p->error != 0) { /* don't mess things up further */ |
832 | 0 | freeset(p, cs); |
833 | 0 | return; |
834 | 0 | } |
835 | 0 | |
836 | 0 | if (p->g->cflags®_ICASE) { |
837 | 0 | int i; |
838 | 0 | int ci; |
839 | 0 |
|
840 | 0 | for (i = p->g->csetsize - 1; i >= 0; i--) |
841 | 0 | if (CHIN(cs, i) && isalpha(i)) { |
842 | 0 | ci = othercase(i); |
843 | 0 | if (ci != i) |
844 | 0 | CHadd(cs, ci); |
845 | 0 | } |
846 | 0 | if (cs->multis != NULL) |
847 | 0 | mccase(p, cs); |
848 | 0 | } |
849 | 0 | if (invert) { |
850 | 0 | int i; |
851 | 0 |
|
852 | 0 | for (i = p->g->csetsize - 1; i >= 0; i--) |
853 | 0 | if (CHIN(cs, i)) |
854 | 0 | CHsub(cs, i); |
855 | 0 | else |
856 | 0 | CHadd(cs, i); |
857 | 0 | if (p->g->cflags®_NEWLINE) |
858 | 0 | CHsub(cs, '\n'); |
859 | 0 | if (cs->multis != NULL) |
860 | 0 | mcinvert(p, cs); |
861 | 0 | } |
862 | 0 |
|
863 | 0 | assert(cs->multis == NULL); /* xxx */ |
864 | 0 |
|
865 | 0 | if (nch(p, cs) == 1) { /* optimize singleton sets */ |
866 | 0 | ordinary(p, firstch(p, cs)); |
867 | 0 | freeset(p, cs); |
868 | 0 | } else |
869 | 0 | EMIT(OANYOF, freezeset(p, cs)); |
870 | 0 | } |
871 | | |
872 | | /* |
873 | | - p_b_term - parse one term of a bracketed character list |
874 | | */ |
875 | | static void |
876 | | p_b_term(struct parse *p, cset *cs) |
877 | 0 | { |
878 | 0 | char c; |
879 | 0 | char start, finish; |
880 | 0 | int i; |
881 | 0 |
|
882 | 0 | /* classify what we've got */ |
883 | 0 | switch ((MORE()) ? PEEK() : '\0') { |
884 | 0 | case '[': |
885 | 0 | c = (MORE2()) ? PEEK2() : '\0'; |
886 | 0 | break; |
887 | 0 | case '-': |
888 | 0 | SETERROR(REG_ERANGE); |
889 | 0 | return; /* NOTE RETURN */ |
890 | 0 | break; |
891 | 0 | default: |
892 | 0 | c = '\0'; |
893 | 0 | break; |
894 | 0 | } |
895 | 0 | |
896 | 0 | switch (c) { |
897 | 0 | case ':': /* character class */ |
898 | 0 | NEXT2(); |
899 | 0 | REQUIRE(MORE(), REG_EBRACK); |
900 | 0 | c = PEEK(); |
901 | 0 | REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
902 | 0 | p_b_cclass(p, cs); |
903 | 0 | REQUIRE(MORE(), REG_EBRACK); |
904 | 0 | REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
905 | 0 | break; |
906 | 0 | case '=': /* equivalence class */ |
907 | 0 | NEXT2(); |
908 | 0 | REQUIRE(MORE(), REG_EBRACK); |
909 | 0 | c = PEEK(); |
910 | 0 | REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
911 | 0 | p_b_eclass(p, cs); |
912 | 0 | REQUIRE(MORE(), REG_EBRACK); |
913 | 0 | REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
914 | 0 | break; |
915 | 0 | default: /* symbol, ordinary character, or range */ |
916 | 0 | /* xxx revision needed for multichar stuff */ |
917 | 0 | start = p_b_symbol(p); |
918 | 0 | if (SEE('-') && MORE2() && PEEK2() != ']') { |
919 | 0 | /* range */ |
920 | 0 | NEXT(); |
921 | 0 | if (EAT('-')) |
922 | 0 | finish = '-'; |
923 | 0 | else |
924 | 0 | finish = p_b_symbol(p); |
925 | 0 | } else |
926 | 0 | finish = start; |
927 | 0 | /* xxx what about signed chars here... */ |
928 | 0 | REQUIRE(start <= finish, REG_ERANGE); |
929 | 0 | for (i = start; i <= finish; i++) |
930 | 0 | CHadd(cs, i); |
931 | 0 | break; |
932 | 0 | } |
933 | 0 | } |
934 | | |
935 | | /* |
936 | | - p_b_cclass - parse a character-class name and deal with it |
937 | | */ |
938 | | static void |
939 | | p_b_cclass(struct parse *p, cset *cs) |
940 | 0 | { |
941 | 0 | char *sp = p->next; |
942 | 0 | struct cclass *cp; |
943 | 0 | size_t len; |
944 | 0 | const char *u; |
945 | 0 | char c; |
946 | 0 |
|
947 | 0 | while (MORE() && isalpha((uch)PEEK())) |
948 | 0 | NEXT(); |
949 | 0 | len = p->next - sp; |
950 | 0 | for (cp = cclasses; cp->name != NULL; cp++) |
951 | 0 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
952 | 0 | break; |
953 | 0 | if (cp->name == NULL) { |
954 | 0 | /* oops, didn't find it */ |
955 | 0 | SETERROR(REG_ECTYPE); |
956 | 0 | return; |
957 | 0 | } |
958 | 0 |
|
959 | 0 | u = cp->chars; |
960 | 0 | while ((c = *u++) != '\0') |
961 | 0 | CHadd(cs, c); |
962 | 0 | for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) |
963 | 0 | MCadd(p, cs, u); |
964 | 0 | } |
965 | | |
966 | | /* |
967 | | - p_b_eclass - parse an equivalence-class name and deal with it |
968 | | * |
969 | | * This implementation is incomplete. xxx |
970 | | */ |
971 | | static void |
972 | | p_b_eclass(struct parse *p, cset *cs) |
973 | 0 | { |
974 | 0 | char c; |
975 | 0 |
|
976 | 0 | c = p_b_coll_elem(p, '='); |
977 | 0 | CHadd(cs, c); |
978 | 0 | } |
979 | | |
980 | | /* |
981 | | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
982 | | */ |
983 | | static char /* value of symbol */ |
984 | | p_b_symbol(struct parse *p) |
985 | 0 | { |
986 | 0 | char value; |
987 | 0 |
|
988 | 0 | REQUIRE(MORE(), REG_EBRACK); |
989 | 0 | if (!EATTWO('[', '.')) |
990 | 0 | return(GETNEXT()); |
991 | 0 |
|
992 | 0 | /* collating symbol */ |
993 | 0 | value = p_b_coll_elem(p, '.'); |
994 | 0 | REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
995 | 0 | return(value); |
996 | 0 | } |
997 | | |
998 | | /* |
999 | | - p_b_coll_elem - parse a collating-element name and look it up |
1000 | | */ |
1001 | | static char /* value of collating element */ |
1002 | | p_b_coll_elem(struct parse *p, |
1003 | | int endc) /* name ended by endc,']' */ |
1004 | 0 | { |
1005 | 0 | char *sp = p->next; |
1006 | 0 | struct cname *cp; |
1007 | 0 | size_t len; |
1008 | 0 |
|
1009 | 0 | while (MORE() && !SEETWO(endc, ']')) |
1010 | 0 | NEXT(); |
1011 | 0 | if (!MORE()) { |
1012 | 0 | SETERROR(REG_EBRACK); |
1013 | 0 | return(0); |
1014 | 0 | } |
1015 | 0 | len = p->next - sp; |
1016 | 0 | for (cp = cnames; cp->name != NULL; cp++) |
1017 | 0 | if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) |
1018 | 0 | return(cp->code); /* known name */ |
1019 | 0 | if (len == 1) |
1020 | 0 | return(*sp); /* single character */ |
1021 | 0 | SETERROR(REG_ECOLLATE); /* neither */ |
1022 | 0 | return(0); |
1023 | 0 | } |
1024 | | |
1025 | | /* |
1026 | | - othercase - return the case counterpart of an alphabetic |
1027 | | */ |
1028 | | static char /* if no counterpart, return ch */ |
1029 | | othercase(int ch) |
1030 | 0 | { |
1031 | 0 | ch = (uch)ch; |
1032 | 0 | assert(isalpha(ch)); |
1033 | 0 | if (isupper(ch)) |
1034 | 0 | return ((uch)tolower(ch)); |
1035 | 0 | else if (islower(ch)) |
1036 | 0 | return ((uch)toupper(ch)); |
1037 | 0 | else /* peculiar, but could happen */ |
1038 | 0 | return(ch); |
1039 | 0 | } |
1040 | | |
1041 | | /* |
1042 | | - bothcases - emit a dualcase version of a two-case character |
1043 | | * |
1044 | | * Boy, is this implementation ever a kludge... |
1045 | | */ |
1046 | | static void |
1047 | | bothcases(struct parse *p, int ch) |
1048 | 0 | { |
1049 | 0 | char *oldnext = p->next; |
1050 | 0 | char *oldend = p->end; |
1051 | 0 | char bracket[3]; |
1052 | 0 |
|
1053 | 0 | ch = (uch)ch; |
1054 | 0 | assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
1055 | 0 | p->next = bracket; |
1056 | 0 | p->end = bracket+2; |
1057 | 0 | bracket[0] = ch; |
1058 | 0 | bracket[1] = ']'; |
1059 | 0 | bracket[2] = '\0'; |
1060 | 0 | p_bracket(p); |
1061 | 0 | assert(p->next == bracket+2); |
1062 | 0 | p->next = oldnext; |
1063 | 0 | p->end = oldend; |
1064 | 0 | } |
1065 | | |
1066 | | /* |
1067 | | - ordinary - emit an ordinary character |
1068 | | */ |
1069 | | static void |
1070 | | ordinary(struct parse *p, int ch) |
1071 | 0 | { |
1072 | 0 | cat_t *cap = p->g->categories; |
1073 | 0 |
|
1074 | 0 | if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) |
1075 | 0 | bothcases(p, ch); |
1076 | 0 | else { |
1077 | 0 | EMIT(OCHAR, (uch)ch); |
1078 | 0 | if (cap[ch] == 0) |
1079 | 0 | cap[ch] = p->g->ncategories++; |
1080 | 0 | } |
1081 | 0 | } |
1082 | | |
1083 | | /* |
1084 | | - nonnewline - emit REG_NEWLINE version of OANY |
1085 | | * |
1086 | | * Boy, is this implementation ever a kludge... |
1087 | | */ |
1088 | | static void |
1089 | | nonnewline(struct parse *p) |
1090 | 0 | { |
1091 | 0 | char *oldnext = p->next; |
1092 | 0 | char *oldend = p->end; |
1093 | 0 | char bracket[4]; |
1094 | 0 |
|
1095 | 0 | p->next = bracket; |
1096 | 0 | p->end = bracket+3; |
1097 | 0 | bracket[0] = '^'; |
1098 | 0 | bracket[1] = '\n'; |
1099 | 0 | bracket[2] = ']'; |
1100 | 0 | bracket[3] = '\0'; |
1101 | 0 | p_bracket(p); |
1102 | 0 | assert(p->next == bracket+3); |
1103 | 0 | p->next = oldnext; |
1104 | 0 | p->end = oldend; |
1105 | 0 | } |
1106 | | |
1107 | | /* |
1108 | | - repeat - generate code for a bounded repetition, recursively if needed |
1109 | | */ |
1110 | | static void |
1111 | | repeat(struct parse *p, |
1112 | | sopno start, /* operand from here to end of strip */ |
1113 | | int from, /* repeated from this number */ |
1114 | | int to) /* to this number of times (maybe INFINITY) */ |
1115 | 0 | { |
1116 | 0 | sopno finish = HERE(); |
1117 | 0 | # define N 2 |
1118 | 0 | # define INF 3 |
1119 | 0 | # define REP(f, t) ((f)*8 + (t)) |
1120 | 0 | # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
1121 | 0 | sopno copy; |
1122 | 0 |
|
1123 | 0 | if (p->error != 0) /* head off possible runaway recursion */ |
1124 | 0 | return; |
1125 | 0 | |
1126 | 0 | assert(from <= to); |
1127 | 0 |
|
1128 | 0 | switch (REP(MAP(from), MAP(to))) { |
1129 | 0 | case REP(0, 0): /* must be user doing this */ |
1130 | 0 | DROP(finish-start); /* drop the operand */ |
1131 | 0 | break; |
1132 | 0 | case REP(0, 1): /* as x{1,1}? */ |
1133 | 0 | case REP(0, N): /* as x{1,n}? */ |
1134 | 0 | case REP(0, INF): /* as x{1,}? */ |
1135 | 0 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1136 | 0 | INSERT(OCH_, start); /* offset is wrong... */ |
1137 | 0 | repeat(p, start+1, 1, to); |
1138 | 0 | ASTERN(OOR1, start); |
1139 | 0 | AHEAD(start); /* ... fix it */ |
1140 | 0 | EMIT(OOR2, 0); |
1141 | 0 | AHEAD(THERE()); |
1142 | 0 | ASTERN(O_CH, THERETHERE()); |
1143 | 0 | break; |
1144 | 0 | case REP(1, 1): /* trivial case */ |
1145 | 0 | /* done */ |
1146 | 0 | break; |
1147 | 0 | case REP(1, N): /* as x?x{1,n-1} */ |
1148 | 0 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1149 | 0 | INSERT(OCH_, start); |
1150 | 0 | ASTERN(OOR1, start); |
1151 | 0 | AHEAD(start); |
1152 | 0 | EMIT(OOR2, 0); /* offset very wrong... */ |
1153 | 0 | AHEAD(THERE()); /* ...so fix it */ |
1154 | 0 | ASTERN(O_CH, THERETHERE()); |
1155 | 0 | copy = dupl(p, start+1, finish+1); |
1156 | 0 | assert(copy == finish+4); |
1157 | 0 | repeat(p, copy, 1, to-1); |
1158 | 0 | break; |
1159 | 0 | case REP(1, INF): /* as x+ */ |
1160 | 0 | INSERT(OPLUS_, start); |
1161 | 0 | ASTERN(O_PLUS, start); |
1162 | 0 | break; |
1163 | 0 | case REP(N, N): /* as xx{m-1,n-1} */ |
1164 | 0 | copy = dupl(p, start, finish); |
1165 | 0 | repeat(p, copy, from-1, to-1); |
1166 | 0 | break; |
1167 | 0 | case REP(N, INF): /* as xx{n-1,INF} */ |
1168 | 0 | copy = dupl(p, start, finish); |
1169 | 0 | repeat(p, copy, from-1, to); |
1170 | 0 | break; |
1171 | 0 | default: /* "can't happen" */ |
1172 | 0 | SETERROR(REG_ASSERT); /* just in case */ |
1173 | 0 | break; |
1174 | 0 | } |
1175 | 0 | } |
1176 | | |
1177 | | /* |
1178 | | - seterr - set an error condition |
1179 | | */ |
1180 | | static int /* useless but makes type checking happy */ |
1181 | | seterr(struct parse *p, int e) |
1182 | 0 | { |
1183 | 0 | if (p->error == 0) /* keep earliest error condition */ |
1184 | 0 | p->error = e; |
1185 | 0 | p->next = nuls; /* try to bring things to a halt */ |
1186 | 0 | p->end = nuls; |
1187 | 0 | return(0); /* make the return value well-defined */ |
1188 | 0 | } |
1189 | | |
1190 | | /* |
1191 | | - allocset - allocate a set of characters for [] |
1192 | | */ |
1193 | | static cset * |
1194 | | allocset(struct parse *p) |
1195 | 0 | { |
1196 | 0 | int no = p->g->ncsets++; |
1197 | 0 | size_t nc; |
1198 | 0 | size_t nbytes; |
1199 | 0 | cset *cs; |
1200 | 0 | size_t css = (size_t)p->g->csetsize; |
1201 | 0 | int i; |
1202 | 0 |
|
1203 | 0 | if (no >= p->ncsalloc) { /* need another column of space */ |
1204 | 0 | void *ptr; |
1205 | 0 |
|
1206 | 0 | p->ncsalloc += CHAR_BIT; |
1207 | 0 | nc = p->ncsalloc; |
1208 | 0 | if (nc > SIZE_MAX / sizeof(cset)) |
1209 | 0 | goto nomem; |
1210 | 0 | assert(nc % CHAR_BIT == 0); |
1211 | 0 | nbytes = nc / CHAR_BIT * css; |
1212 | 0 |
|
1213 | 0 | ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); |
1214 | 0 | if (ptr == NULL) |
1215 | 0 | goto nomem; |
1216 | 0 | p->g->sets = ptr; |
1217 | 0 |
|
1218 | 0 | ptr = (uch *)realloc((char *)p->g->setbits, nbytes); |
1219 | 0 | if (ptr == NULL) |
1220 | 0 | goto nomem; |
1221 | 0 | p->g->setbits = ptr; |
1222 | 0 |
|
1223 | 0 | for (i = 0; i < no; i++) |
1224 | 0 | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); |
1225 | 0 |
|
1226 | 0 | (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); |
1227 | 0 | } |
1228 | 0 | /* XXX should not happen */ |
1229 | 0 | if (p->g->sets == NULL || p->g->setbits == NULL) |
1230 | 0 | goto nomem; |
1231 | 0 | |
1232 | 0 | cs = &p->g->sets[no]; |
1233 | 0 | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); |
1234 | 0 | cs->mask = 1 << ((no) % CHAR_BIT); |
1235 | 0 | cs->hash = 0; |
1236 | 0 | cs->smultis = 0; |
1237 | 0 | cs->multis = NULL; |
1238 | 0 |
|
1239 | 0 | return(cs); |
1240 | 0 | nomem: |
1241 | 0 | free(p->g->sets); |
1242 | 0 | p->g->sets = NULL; |
1243 | 0 | free(p->g->setbits); |
1244 | 0 | p->g->setbits = NULL; |
1245 | 0 |
|
1246 | 0 | SETERROR(REG_ESPACE); |
1247 | 0 | /* caller's responsibility not to do set ops */ |
1248 | 0 | return(NULL); |
1249 | 0 | } |
1250 | | |
1251 | | /* |
1252 | | - freeset - free a now-unused set |
1253 | | */ |
1254 | | static void |
1255 | | freeset(struct parse *p, cset *cs) |
1256 | 0 | { |
1257 | 0 | size_t i; |
1258 | 0 | cset *top = &p->g->sets[p->g->ncsets]; |
1259 | 0 | size_t css = (size_t)p->g->csetsize; |
1260 | 0 |
|
1261 | 0 | for (i = 0; i < css; i++) |
1262 | 0 | CHsub(cs, i); |
1263 | 0 | if (cs == top-1) /* recover only the easy case */ |
1264 | 0 | p->g->ncsets--; |
1265 | 0 | } |
1266 | | |
1267 | | /* |
1268 | | - freezeset - final processing on a set of characters |
1269 | | * |
1270 | | * The main task here is merging identical sets. This is usually a waste |
1271 | | * of time (although the hash code minimizes the overhead), but can win |
1272 | | * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash |
1273 | | * is done using addition rather than xor -- all ASCII [aA] sets xor to |
1274 | | * the same value! |
1275 | | */ |
1276 | | static int /* set number */ |
1277 | | freezeset(struct parse *p, cset *cs) |
1278 | 0 | { |
1279 | 0 | uch h = cs->hash; |
1280 | 0 | size_t i; |
1281 | 0 | cset *top = &p->g->sets[p->g->ncsets]; |
1282 | 0 | cset *cs2; |
1283 | 0 | size_t css = (size_t)p->g->csetsize; |
1284 | 0 |
|
1285 | 0 | /* look for an earlier one which is the same */ |
1286 | 0 | for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) |
1287 | 0 | if (cs2->hash == h && cs2 != cs) { |
1288 | 0 | /* maybe */ |
1289 | 0 | for (i = 0; i < css; i++) |
1290 | 0 | if (!!CHIN(cs2, i) != !!CHIN(cs, i)) |
1291 | 0 | break; /* no */ |
1292 | 0 | if (i == css) |
1293 | 0 | break; /* yes */ |
1294 | 0 | } |
1295 | 0 |
|
1296 | 0 | if (cs2 < top) { /* found one */ |
1297 | 0 | freeset(p, cs); |
1298 | 0 | cs = cs2; |
1299 | 0 | } |
1300 | 0 |
|
1301 | 0 | return((int)(cs - p->g->sets)); |
1302 | 0 | } |
1303 | | |
1304 | | /* |
1305 | | - firstch - return first character in a set (which must have at least one) |
1306 | | */ |
1307 | | static int /* character; there is no "none" value */ |
1308 | | firstch(struct parse *p, cset *cs) |
1309 | 0 | { |
1310 | 0 | size_t i; |
1311 | 0 | size_t css = (size_t)p->g->csetsize; |
1312 | 0 |
|
1313 | 0 | for (i = 0; i < css; i++) |
1314 | 0 | if (CHIN(cs, i)) |
1315 | 0 | return((char)i); |
1316 | 0 | assert(never); |
1317 | 0 | return(0); /* arbitrary */ |
1318 | 0 | } |
1319 | | |
1320 | | /* |
1321 | | - nch - number of characters in a set |
1322 | | */ |
1323 | | static int |
1324 | | nch(struct parse *p, cset *cs) |
1325 | 0 | { |
1326 | 0 | size_t i; |
1327 | 0 | size_t css = (size_t)p->g->csetsize; |
1328 | 0 | int n = 0; |
1329 | 0 |
|
1330 | 0 | for (i = 0; i < css; i++) |
1331 | 0 | if (CHIN(cs, i)) |
1332 | 0 | n++; |
1333 | 0 | return(n); |
1334 | 0 | } |
1335 | | |
1336 | | /* |
1337 | | - mcadd - add a collating element to a cset |
1338 | | */ |
1339 | | static void |
1340 | | mcadd( struct parse *p, cset *cs, const char *cp) |
1341 | 0 | { |
1342 | 0 | size_t oldend = cs->smultis; |
1343 | 0 | void *np; |
1344 | 0 |
|
1345 | 0 | cs->smultis += strlen(cp) + 1; |
1346 | 0 | np = realloc(cs->multis, cs->smultis); |
1347 | 0 | if (np == NULL) { |
1348 | 0 | if (cs->multis) |
1349 | 0 | free(cs->multis); |
1350 | 0 | cs->multis = NULL; |
1351 | 0 | SETERROR(REG_ESPACE); |
1352 | 0 | return; |
1353 | 0 | } |
1354 | 0 | cs->multis = np; |
1355 | 0 |
|
1356 | 0 | llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); |
1357 | 0 | } |
1358 | | |
1359 | | /* |
1360 | | - mcinvert - invert the list of collating elements in a cset |
1361 | | * |
1362 | | * This would have to know the set of possibilities. Implementation |
1363 | | * is deferred. |
1364 | | */ |
1365 | | /* ARGSUSED */ |
1366 | | static void |
1367 | | mcinvert(struct parse *p, cset *cs) |
1368 | 0 | { |
1369 | 0 | assert(cs->multis == NULL); /* xxx */ |
1370 | 0 | } |
1371 | | |
1372 | | /* |
1373 | | - mccase - add case counterparts of the list of collating elements in a cset |
1374 | | * |
1375 | | * This would have to know the set of possibilities. Implementation |
1376 | | * is deferred. |
1377 | | */ |
1378 | | /* ARGSUSED */ |
1379 | | static void |
1380 | | mccase(struct parse *p, cset *cs) |
1381 | 0 | { |
1382 | 0 | assert(cs->multis == NULL); /* xxx */ |
1383 | 0 | } |
1384 | | |
1385 | | /* |
1386 | | - isinsets - is this character in any sets? |
1387 | | */ |
1388 | | static int /* predicate */ |
1389 | | isinsets(struct re_guts *g, int c) |
1390 | 0 | { |
1391 | 0 | uch *col; |
1392 | 0 | int i; |
1393 | 0 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1394 | 0 | unsigned uc = (uch)c; |
1395 | 0 |
|
1396 | 0 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1397 | 0 | if (col[uc] != 0) |
1398 | 0 | return(1); |
1399 | 0 | return(0); |
1400 | 0 | } |
1401 | | |
1402 | | /* |
1403 | | - samesets - are these two characters in exactly the same sets? |
1404 | | */ |
1405 | | static int /* predicate */ |
1406 | | samesets(struct re_guts *g, int c1, int c2) |
1407 | 0 | { |
1408 | 0 | uch *col; |
1409 | 0 | int i; |
1410 | 0 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1411 | 0 | unsigned uc1 = (uch)c1; |
1412 | 0 | unsigned uc2 = (uch)c2; |
1413 | 0 |
|
1414 | 0 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1415 | 0 | if (col[uc1] != col[uc2]) |
1416 | 0 | return(0); |
1417 | 0 | return(1); |
1418 | 0 | } |
1419 | | |
1420 | | /* |
1421 | | - categorize - sort out character categories |
1422 | | */ |
1423 | | static void |
1424 | | categorize(struct parse *p, struct re_guts *g) |
1425 | 0 | { |
1426 | 0 | cat_t *cats = g->categories; |
1427 | 0 | int c; |
1428 | 0 | int c2; |
1429 | 0 | cat_t cat; |
1430 | 0 |
|
1431 | 0 | /* avoid making error situations worse */ |
1432 | 0 | if (p->error != 0) |
1433 | 0 | return; |
1434 | 0 | |
1435 | 0 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) |
1436 | 0 | if (cats[c] == 0 && isinsets(g, c)) { |
1437 | 0 | cat = g->ncategories++; |
1438 | 0 | cats[c] = cat; |
1439 | 0 | for (c2 = c+1; c2 <= CHAR_MAX; c2++) |
1440 | 0 | if (cats[c2] == 0 && samesets(g, c, c2)) |
1441 | 0 | cats[c2] = cat; |
1442 | 0 | } |
1443 | 0 | } |
1444 | | |
1445 | | /* |
1446 | | - dupl - emit a duplicate of a bunch of sops |
1447 | | */ |
1448 | | static sopno /* start of duplicate */ |
1449 | | dupl(struct parse *p, |
1450 | | sopno start, /* from here */ |
1451 | | sopno finish) /* to this less one */ |
1452 | 0 | { |
1453 | 0 | sopno ret = HERE(); |
1454 | 0 | sopno len = finish - start; |
1455 | 0 |
|
1456 | 0 | assert(finish >= start); |
1457 | 0 | if (len == 0) |
1458 | 0 | return(ret); |
1459 | 0 | enlarge(p, p->ssize + len); /* this many unexpected additions */ |
1460 | 0 | assert(p->ssize >= p->slen + len); |
1461 | 0 | (void) memmove((char *)(p->strip + p->slen), |
1462 | 0 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); |
1463 | 0 | p->slen += len; |
1464 | 0 | return(ret); |
1465 | 0 | } |
1466 | | |
1467 | | /* |
1468 | | - doemit - emit a strip operator |
1469 | | * |
1470 | | * It might seem better to implement this as a macro with a function as |
1471 | | * hard-case backup, but it's just too big and messy unless there are |
1472 | | * some changes to the data structures. Maybe later. |
1473 | | */ |
1474 | | static void |
1475 | | doemit(struct parse *p, sop op, size_t opnd) |
1476 | 0 | { |
1477 | 0 | /* avoid making error situations worse */ |
1478 | 0 | if (p->error != 0) |
1479 | 0 | return; |
1480 | 0 | |
1481 | 0 | /* deal with oversize operands ("can't happen", more or less) */ |
1482 | 0 | assert(opnd < 1<<OPSHIFT); |
1483 | 0 |
|
1484 | 0 | /* deal with undersized strip */ |
1485 | 0 | if (p->slen >= p->ssize) |
1486 | 0 | enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ |
1487 | 0 | assert(p->slen < p->ssize); |
1488 | 0 |
|
1489 | 0 | /* finally, it's all reduced to the easy case */ |
1490 | 0 | p->strip[p->slen++] = SOP(op, opnd); |
1491 | 0 | } |
1492 | | |
1493 | | /* |
1494 | | - doinsert - insert a sop into the strip |
1495 | | */ |
1496 | | static void |
1497 | | doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
1498 | 0 | { |
1499 | 0 | sopno sn; |
1500 | 0 | sop s; |
1501 | 0 | int i; |
1502 | 0 |
|
1503 | 0 | /* avoid making error situations worse */ |
1504 | 0 | if (p->error != 0) |
1505 | 0 | return; |
1506 | 0 | |
1507 | 0 | sn = HERE(); |
1508 | 0 | EMIT(op, opnd); /* do checks, ensure space */ |
1509 | 0 | assert(HERE() == sn+1); |
1510 | 0 | s = p->strip[sn]; |
1511 | 0 |
|
1512 | 0 | /* adjust paren pointers */ |
1513 | 0 | assert(pos > 0); |
1514 | 0 | for (i = 1; i < NPAREN; i++) { |
1515 | 0 | if (p->pbegin[i] >= pos) { |
1516 | 0 | p->pbegin[i]++; |
1517 | 0 | } |
1518 | 0 | if (p->pend[i] >= pos) { |
1519 | 0 | p->pend[i]++; |
1520 | 0 | } |
1521 | 0 | } |
1522 | 0 |
|
1523 | 0 | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
1524 | 0 | (HERE()-pos-1)*sizeof(sop)); |
1525 | 0 | p->strip[pos] = s; |
1526 | 0 | } |
1527 | | |
1528 | | /* |
1529 | | - dofwd - complete a forward reference |
1530 | | */ |
1531 | | static void |
1532 | | dofwd(struct parse *p, sopno pos, sop value) |
1533 | 0 | { |
1534 | 0 | /* avoid making error situations worse */ |
1535 | 0 | if (p->error != 0) |
1536 | 0 | return; |
1537 | 0 | |
1538 | 0 | assert(value < 1<<OPSHIFT); |
1539 | 0 | p->strip[pos] = OP(p->strip[pos]) | value; |
1540 | 0 | } |
1541 | | |
1542 | | /* |
1543 | | - enlarge - enlarge the strip |
1544 | | */ |
1545 | | static void |
1546 | | enlarge(struct parse *p, sopno size) |
1547 | 0 | { |
1548 | 0 | sop *sp; |
1549 | 0 |
|
1550 | 0 | if (p->ssize >= size) |
1551 | 0 | return; |
1552 | 0 | |
1553 | 0 | if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) { |
1554 | 0 | SETERROR(REG_ESPACE); |
1555 | 0 | return; |
1556 | 0 | } |
1557 | 0 |
|
1558 | 0 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); |
1559 | 0 | if (sp == NULL) { |
1560 | 0 | SETERROR(REG_ESPACE); |
1561 | 0 | return; |
1562 | 0 | } |
1563 | 0 | p->strip = sp; |
1564 | 0 | p->ssize = size; |
1565 | 0 | } |
1566 | | |
1567 | | /* |
1568 | | - stripsnug - compact the strip |
1569 | | */ |
1570 | | static void |
1571 | | stripsnug(struct parse *p, struct re_guts *g) |
1572 | 0 | { |
1573 | 0 | g->nstates = p->slen; |
1574 | 0 | if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) { |
1575 | 0 | g->strip = p->strip; |
1576 | 0 | SETERROR(REG_ESPACE); |
1577 | 0 | return; |
1578 | 0 | } |
1579 | 0 |
|
1580 | 0 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); |
1581 | 0 | if (g->strip == NULL) { |
1582 | 0 | SETERROR(REG_ESPACE); |
1583 | 0 | g->strip = p->strip; |
1584 | 0 | } |
1585 | 0 | } |
1586 | | |
1587 | | /* |
1588 | | - findmust - fill in must and mlen with longest mandatory literal string |
1589 | | * |
1590 | | * This algorithm could do fancy things like analyzing the operands of | |
1591 | | * for common subsequences. Someday. This code is simple and finds most |
1592 | | * of the interesting cases. |
1593 | | * |
1594 | | * Note that must and mlen got initialized during setup. |
1595 | | */ |
1596 | | static void |
1597 | | findmust(struct parse *p, struct re_guts *g) |
1598 | 0 | { |
1599 | 0 | sop *scan; |
1600 | 0 | sop *start = 0; /* start initialized in the default case, after that */ |
1601 | 0 | sop *newstart = 0; /* newstart was initialized in the OCHAR case */ |
1602 | 0 | sopno newlen; |
1603 | 0 | sop s; |
1604 | 0 | char *cp; |
1605 | 0 | sopno i; |
1606 | 0 |
|
1607 | 0 | /* avoid making error situations worse */ |
1608 | 0 | if (p->error != 0) |
1609 | 0 | return; |
1610 | 0 | |
1611 | 0 | /* find the longest OCHAR sequence in strip */ |
1612 | 0 | newlen = 0; |
1613 | 0 | scan = g->strip + 1; |
1614 | 0 | do { |
1615 | 0 | s = *scan++; |
1616 | 0 | switch (OP(s)) { |
1617 | 0 | case OCHAR: /* sequence member */ |
1618 | 0 | if (newlen == 0) /* new sequence */ |
1619 | 0 | newstart = scan - 1; |
1620 | 0 | newlen++; |
1621 | 0 | break; |
1622 | 0 | case OPLUS_: /* things that don't break one */ |
1623 | 0 | case OLPAREN: |
1624 | 0 | case ORPAREN: |
1625 | 0 | break; |
1626 | 0 | case OQUEST_: /* things that must be skipped */ |
1627 | 0 | case OCH_: |
1628 | 0 | scan--; |
1629 | 0 | do { |
1630 | 0 | scan += OPND(s); |
1631 | 0 | s = *scan; |
1632 | 0 | /* assert() interferes w debug printouts */ |
1633 | 0 | if (OP(s) != O_QUEST && OP(s) != O_CH && |
1634 | 0 | OP(s) != OOR2) { |
1635 | 0 | g->iflags |= REGEX_BAD; |
1636 | 0 | return; |
1637 | 0 | } |
1638 | 0 | } while (OP(s) != O_QUEST && OP(s) != O_CH); |
1639 | 0 | LLVM_FALLTHROUGH; |
1640 | 0 | default: /* things that break a sequence */ |
1641 | 0 | if (newlen > g->mlen) { /* ends one */ |
1642 | 0 | start = newstart; |
1643 | 0 | g->mlen = newlen; |
1644 | 0 | } |
1645 | 0 | newlen = 0; |
1646 | 0 | break; |
1647 | 0 | } |
1648 | 0 | } while (OP(s) != OEND); |
1649 | 0 |
|
1650 | 0 | if (g->mlen == 0) /* there isn't one */ |
1651 | 0 | return; |
1652 | 0 | |
1653 | 0 | /* turn it into a character string */ |
1654 | 0 | g->must = malloc((size_t)g->mlen + 1); |
1655 | 0 | if (g->must == NULL) { /* argh; just forget it */ |
1656 | 0 | g->mlen = 0; |
1657 | 0 | return; |
1658 | 0 | } |
1659 | 0 | cp = g->must; |
1660 | 0 | scan = start; |
1661 | 0 | for (i = g->mlen; i > 0; i--) { |
1662 | 0 | while (OP(s = *scan++) != OCHAR) |
1663 | 0 | continue; |
1664 | 0 | assert(cp < g->must + g->mlen); |
1665 | 0 | *cp++ = (char)OPND(s); |
1666 | 0 | } |
1667 | 0 | assert(cp == g->must + g->mlen); |
1668 | 0 | *cp++ = '\0'; /* just on general principles */ |
1669 | 0 | } |
1670 | | |
1671 | | /* |
1672 | | - pluscount - count + nesting |
1673 | | */ |
1674 | | static sopno /* nesting depth */ |
1675 | | pluscount(struct parse *p, struct re_guts *g) |
1676 | 0 | { |
1677 | 0 | sop *scan; |
1678 | 0 | sop s; |
1679 | 0 | sopno plusnest = 0; |
1680 | 0 | sopno maxnest = 0; |
1681 | 0 |
|
1682 | 0 | if (p->error != 0) |
1683 | 0 | return(0); /* there may not be an OEND */ |
1684 | 0 | |
1685 | 0 | scan = g->strip + 1; |
1686 | 0 | do { |
1687 | 0 | s = *scan++; |
1688 | 0 | switch (OP(s)) { |
1689 | 0 | case OPLUS_: |
1690 | 0 | plusnest++; |
1691 | 0 | break; |
1692 | 0 | case O_PLUS: |
1693 | 0 | if (plusnest > maxnest) |
1694 | 0 | maxnest = plusnest; |
1695 | 0 | plusnest--; |
1696 | 0 | break; |
1697 | 0 | } |
1698 | 0 | } while (OP(s) != OEND); |
1699 | 0 | if (plusnest != 0) |
1700 | 0 | g->iflags |= REGEX_BAD; |
1701 | 0 | return(maxnest); |
1702 | 0 | } |