/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Support/regcomp.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*- |
2 | | * This code is derived from OpenBSD's libc/regex, original license follows: |
3 | | * |
4 | | * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
5 | | * Copyright (c) 1992, 1993, 1994 |
6 | | * The Regents of the University of California. All rights reserved. |
7 | | * |
8 | | * This code is derived from software contributed to Berkeley by |
9 | | * Henry Spencer. |
10 | | * |
11 | | * Redistribution and use in source and binary forms, with or without |
12 | | * modification, are permitted provided that the following conditions |
13 | | * are met: |
14 | | * 1. Redistributions of source code must retain the above copyright |
15 | | * notice, this list of conditions and the following disclaimer. |
16 | | * 2. Redistributions in binary form must reproduce the above copyright |
17 | | * notice, this list of conditions and the following disclaimer in the |
18 | | * documentation and/or other materials provided with the distribution. |
19 | | * 3. Neither the name of the University nor the names of its contributors |
20 | | * may be used to endorse or promote products derived from this software |
21 | | * without specific prior written permission. |
22 | | * |
23 | | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
24 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
25 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
26 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
27 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
28 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
29 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
30 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
31 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
32 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
33 | | * SUCH DAMAGE. |
34 | | * |
35 | | * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
36 | | */ |
37 | | |
38 | | #include <sys/types.h> |
39 | | #include <stdio.h> |
40 | | #include <string.h> |
41 | | #include <ctype.h> |
42 | | #include <limits.h> |
43 | | #include <stdlib.h> |
44 | | #include "regex_impl.h" |
45 | | |
46 | | #include "regutils.h" |
47 | | #include "regex2.h" |
48 | | |
49 | | #include "regcclass.h" |
50 | | #include "regcname.h" |
51 | | |
52 | | #include "llvm/Config/config.h" |
53 | | #if HAVE_STDINT_H |
54 | | #include <stdint.h> |
55 | | #else |
56 | | /* Pessimistically bound memory use */ |
57 | | #define SIZE_MAX UINT_MAX |
58 | | #endif |
59 | | |
60 | | /* |
61 | | * parse structure, passed up and down to avoid global variables and |
62 | | * other clumsinesses |
63 | | */ |
64 | | struct parse { |
65 | | char *next; /* next character in RE */ |
66 | | char *end; /* end of string (-> NUL normally) */ |
67 | | int error; /* has an error been seen? */ |
68 | | sop *strip; /* malloced strip */ |
69 | | sopno ssize; /* malloced strip size (allocated) */ |
70 | | sopno slen; /* malloced strip length (used) */ |
71 | | int ncsalloc; /* number of csets allocated */ |
72 | | struct re_guts *g; |
73 | 44.0M | # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
74 | | sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
75 | | sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
76 | | }; |
77 | | |
78 | | static void p_ere(struct parse *, int); |
79 | | static void p_ere_exp(struct parse *); |
80 | | static void p_str(struct parse *); |
81 | | static void p_bre(struct parse *, int, int); |
82 | | static int p_simp_re(struct parse *, int); |
83 | | static int p_count(struct parse *); |
84 | | static void p_bracket(struct parse *); |
85 | | static void p_b_term(struct parse *, cset *); |
86 | | static void p_b_cclass(struct parse *, cset *); |
87 | | static void p_b_eclass(struct parse *, cset *); |
88 | | static char p_b_symbol(struct parse *); |
89 | | static char p_b_coll_elem(struct parse *, int); |
90 | | static char othercase(int); |
91 | | static void bothcases(struct parse *, int); |
92 | | static void ordinary(struct parse *, int); |
93 | | static void nonnewline(struct parse *); |
94 | | static void repeat(struct parse *, sopno, int, int); |
95 | | static int seterr(struct parse *, int); |
96 | | static cset *allocset(struct parse *); |
97 | | static void freeset(struct parse *, cset *); |
98 | | static int freezeset(struct parse *, cset *); |
99 | | static int firstch(struct parse *, cset *); |
100 | | static int nch(struct parse *, cset *); |
101 | | static void mcadd(struct parse *, cset *, const char *); |
102 | | static void mcinvert(struct parse *, cset *); |
103 | | static void mccase(struct parse *, cset *); |
104 | | static int isinsets(struct re_guts *, int); |
105 | | static int samesets(struct re_guts *, int, int); |
106 | | static void categorize(struct parse *, struct re_guts *); |
107 | | static sopno dupl(struct parse *, sopno, sopno); |
108 | | static void doemit(struct parse *, sop, size_t); |
109 | | static void doinsert(struct parse *, sop, size_t, sopno); |
110 | | static void dofwd(struct parse *, sopno, sop); |
111 | | static void enlarge(struct parse *, sopno); |
112 | | static void stripsnug(struct parse *, struct re_guts *); |
113 | | static void findmust(struct parse *, struct re_guts *); |
114 | | static sopno pluscount(struct parse *, struct re_guts *); |
115 | | |
116 | | static char nuls[10]; /* place to point scanner in event of error */ |
117 | | |
/*
 * Macros for use with the parse structure.
 * BEWARE: every one of these assumes the parse structure pointer is a
 * variable named `p' in the enclosing scope.
 *
 * Scanner access:
 *   PEEK/PEEK2      look at the current / following character
 *   MORE/MORE2      is there a current / following character before p->end?
 *   SEE/SEETWO      bounds-checked test for one / two specific characters
 *   EAT/EATTWO      like SEE/SEETWO, but consume on a match; yield 1 or 0
 *   NEXT/NEXT2/NEXTn/GETNEXT   advance the scanner (no bounds check)
 * Error handling:
 *   SETERROR        record an error through seterr()
 *   REQUIRE         record error e unless condition co holds
 *   MUSTSEE/MUSTEAT/MUSTNOTSEE   REQUIRE wrappers around the scanner tests
 * Strip construction:
 *   EMIT/INSERT/AHEAD/ASTERN   wrappers around doemit/doinsert/dofwd
 *   HERE/THERE/THERETHERE      strip length, length-1, length-2
 *   DROP            shrink the strip by n entries
 */
#define	PEEK()	(*p->next)
#define	PEEK2()	(*(p->next+1))
#define	MORE()	(p->next < p->end)
#define	MORE2()	(p->next+1 < p->end)
#define	SEE(c)	(MORE() && PEEK() == (c))
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define	NEXT()	(p->next++)
#define	NEXT2()	(p->next += 2)
#define	NEXTn(n)	(p->next += (n))
#define	GETNEXT()	(*p->next++)
#define	SETERROR(e)	seterr(p, (e))
#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
/* pos is parenthesized so that an expression argument subtracts as a whole */
#define	ASTERN(op, pos)	EMIT(op, HERE()-(pos))
#define	HERE()		(p->slen)
#define	THERE()		(p->slen - 1)
#define	THERETHERE()	(p->slen - 2)
#define	DROP(n)	(p->slen -= (n))

/* largest repetition count accepted in a bound */
#ifdef	_POSIX2_RE_DUP_MAX
#define	DUPMAX	_POSIX2_RE_DUP_MAX
#else
#define	DUPMAX	255
#endif
/* value used for the upper limit of an open-ended bound such as {n,} */
#define	INFINITY	(DUPMAX + 1)

#ifndef NDEBUG
static int never = 0;		/* for use in asserts; shuts lint up */
#else
#define	never	0		/* some <assert.h>s have bugs too */
#endif
160 | | |
161 | | /* |
162 | | - llvm_regcomp - interface for parser and compilation |
163 | | */ |
164 | | int /* 0 success, otherwise REG_something */ |
165 | | llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) |
166 | 1.07M | { |
167 | 1.07M | struct parse pa; |
168 | 1.07M | struct re_guts *g; |
169 | 1.07M | struct parse *p = &pa; |
170 | 1.07M | int i; |
171 | 1.07M | size_t len; |
172 | | #ifdef REDEBUG |
173 | | # define GOODFLAGS(f) (f) |
174 | | #else |
175 | 1.07M | # define GOODFLAGS(f) ((f)&~1.07M REG_DUMP1.07M ) |
176 | 1.07M | #endif |
177 | 1.07M | |
178 | 1.07M | cflags = GOODFLAGS(cflags); |
179 | 1.07M | if ((cflags&1.07M REG_EXTENDED1.07M ) && (cflags&1.07M REG_NOSPEC1.07M )) |
180 | 0 | return(0 REG_INVARG0 ); |
181 | 1.07M | |
182 | 1.07M | if (1.07M cflags&1.07M REG_PEND1.07M ) { |
183 | 1.07M | if (preg->re_endp < pattern) |
184 | 0 | return(0 REG_INVARG0 ); |
185 | 1.07M | len = preg->re_endp - pattern; |
186 | 1.07M | } else |
187 | 0 | len = strlen((const char *)pattern); |
188 | 1.07M | |
189 | 1.07M | /* do the mallocs early so failure handling is easy */ |
190 | 1.07M | g = (struct re_guts *)malloc(sizeof(struct re_guts) + |
191 | 1.07M | (NC-1)*sizeof(cat_t)); |
192 | 1.07M | if (g == NULL) |
193 | 0 | return(0 REG_ESPACE0 ); |
194 | 1.07M | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
195 | 1.07M | p->strip = (sop *)calloc(p->ssize, sizeof(sop)); |
196 | 1.07M | p->slen = 0; |
197 | 1.07M | if (p->strip == NULL1.07M ) { |
198 | 0 | free((char *)g); |
199 | 0 | return(REG_ESPACE); |
200 | 0 | } |
201 | 1.07M | |
202 | 1.07M | /* set things up */ |
203 | 1.07M | p->g = g; |
204 | 1.07M | p->next = (char *)pattern; /* convenience; we do not modify it */ |
205 | 1.07M | p->end = p->next + len; |
206 | 1.07M | p->error = 0; |
207 | 1.07M | p->ncsalloc = 0; |
208 | 11.7M | for (i = 0; i < 11.7M NPAREN11.7M ; i++10.7M ) { |
209 | 10.7M | p->pbegin[i] = 0; |
210 | 10.7M | p->pend[i] = 0; |
211 | 10.7M | } |
212 | 1.07M | g->csetsize = NC; |
213 | 1.07M | g->sets = NULL; |
214 | 1.07M | g->setbits = NULL; |
215 | 1.07M | g->ncsets = 0; |
216 | 1.07M | g->cflags = cflags; |
217 | 1.07M | g->iflags = 0; |
218 | 1.07M | g->nbol = 0; |
219 | 1.07M | g->neol = 0; |
220 | 1.07M | g->must = NULL; |
221 | 1.07M | g->mlen = 0; |
222 | 1.07M | g->nsub = 0; |
223 | 1.07M | g->ncategories = 1; /* category 0 is "everything else" */ |
224 | 1.07M | g->categories = &g->catspace[-(CHAR_MIN)]; |
225 | 1.07M | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); |
226 | 1.07M | g->backrefs = 0; |
227 | 1.07M | |
228 | 1.07M | /* do it */ |
229 | 1.07M | EMIT(OEND, 0); |
230 | 1.07M | g->firststate = THERE(); |
231 | 1.07M | if (cflags&1.07M REG_EXTENDED1.07M ) |
232 | 1.07M | p_ere(p, 1.07M OUT1.07M ); |
233 | 0 | else if (0 cflags&0 REG_NOSPEC0 ) |
234 | 0 | p_str(p); |
235 | 0 | else |
236 | 0 | p_bre(p, 0 OUT0 , OUT0 ); |
237 | 1.07M | EMIT(OEND, 0); |
238 | 1.07M | g->laststate = THERE(); |
239 | 1.07M | |
240 | 1.07M | /* tidy up loose ends and fill things in */ |
241 | 1.07M | categorize(p, g); |
242 | 1.07M | stripsnug(p, g); |
243 | 1.07M | findmust(p, g); |
244 | 1.07M | g->nplus = pluscount(p, g); |
245 | 1.07M | g->magic = MAGIC2; |
246 | 1.07M | preg->re_nsub = g->nsub; |
247 | 1.07M | preg->re_g = g; |
248 | 1.07M | preg->re_magic = MAGIC1; |
249 | 1.07M | #ifndef REDEBUG |
250 | 1.07M | /* not debugging, so can't rely on the assert() in llvm_regexec() */ |
251 | 1.07M | if (g->iflags&1.07M REGEX_BAD1.07M ) |
252 | 0 | SETERROR(REG_ASSERT); |
253 | 1.07M | #endif |
254 | 1.07M | |
255 | 1.07M | /* win or lose, we're done */ |
256 | 1.07M | if (p->error != 0) /* lose */ |
257 | 32.3k | llvm_regfree(preg); |
258 | 1.07M | return(p->error); |
259 | 1.07M | } |
260 | | |
261 | | /* |
262 | | - p_ere - ERE parser top level, concatenation and alternation |
263 | | */ |
264 | | static void |
265 | | p_ere(struct parse *p, int stop) /* character this ERE should end at */ |
266 | 2.22M | { |
267 | 2.22M | char c; |
268 | 2.22M | sopno prevback = 0; |
269 | 2.22M | sopno prevfwd = 0; |
270 | 2.22M | sopno conc; |
271 | 2.22M | int first = 1; /* is this the first alternative? */ |
272 | 2.22M | |
273 | 3.45M | for (;;) { |
274 | 3.45M | /* do a bunch of concatenated expressions */ |
275 | 3.45M | conc = HERE(); |
276 | 29.7M | while (MORE29.7M () && 29.7M (c = 28.7M PEEK28.7M ()) != '|' && c != stop27.4M ) |
277 | 26.3M | p_ere_exp(p); |
278 | 3.45M | REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
279 | 3.45M | |
280 | 3.45M | if (!3.45M EAT3.45M ('|')) |
281 | 2.22M | break; /* NOTE BREAK OUT */ |
282 | 1.23M | |
283 | 1.23M | if (1.23M first1.23M ) { |
284 | 995k | INSERT(OCH_, conc); /* offset is wrong */ |
285 | 995k | prevfwd = conc; |
286 | 995k | prevback = conc; |
287 | 995k | first = 0; |
288 | 995k | } |
289 | 1.23M | ASTERN(OOR1, prevback); |
290 | 1.23M | prevback = THERE(); |
291 | 1.23M | AHEAD(prevfwd); /* fix previous offset */ |
292 | 1.23M | prevfwd = HERE(); |
293 | 1.23M | EMIT(OOR2, 0); /* offset is very wrong */ |
294 | 3.45M | } |
295 | 2.22M | |
296 | 2.22M | if (!first2.22M ) { /* tail-end fixups */ |
297 | 995k | AHEAD(prevfwd); |
298 | 995k | ASTERN(O_CH, prevback); |
299 | 995k | } |
300 | 2.22M | |
301 | 2.22M | assert(!MORE() || SEE(stop)); |
302 | 2.22M | } |
303 | | |
304 | | /* |
305 | | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
306 | | */ |
307 | | static void |
308 | | p_ere_exp(struct parse *p) |
309 | 26.3M | { |
310 | 26.3M | char c; |
311 | 26.3M | sopno pos; |
312 | 26.3M | int count; |
313 | 26.3M | int count2; |
314 | 26.3M | int backrefnum; |
315 | 26.3M | sopno subno; |
316 | 26.3M | int wascaret = 0; |
317 | 26.3M | |
318 | 26.3M | assert(MORE()); /* caller should have ensured this */ |
319 | 26.3M | c = GETNEXT(); |
320 | 26.3M | |
321 | 26.3M | pos = HERE(); |
322 | 26.3M | switch (c) { |
323 | 1.15M | case '(': |
324 | 1.15M | REQUIRE(MORE(), REG_EPAREN); |
325 | 1.15M | p->g->nsub++; |
326 | 1.15M | subno = p->g->nsub; |
327 | 1.15M | if (subno < 1.15M NPAREN1.15M ) |
328 | 1.15M | p->pbegin[subno] = 1.15M HERE1.15M (); |
329 | 1.15M | EMIT(OLPAREN, subno); |
330 | 1.15M | if (!1.15M SEE1.15M (')')) |
331 | 1.15M | p_ere(p, ')'); |
332 | 1.15M | if (subno < 1.15M NPAREN1.15M ) { |
333 | 1.15M | p->pend[subno] = HERE(); |
334 | 1.15M | assert(p->pend[subno] != 0); |
335 | 1.15M | } |
336 | 1.15M | EMIT(ORPAREN, subno); |
337 | 1.15M | MUSTEAT(')', REG_EPAREN); |
338 | 1.15M | break; |
339 | 26.3M | #ifndef POSIX_MISTAKE |
340 | 0 | case ')': /* happens only if no current unmatched ( */ |
341 | 0 | /* |
342 | 0 | * You may ask, why the ifndef? Because I didn't notice |
343 | 0 | * this until slightly too late for 1003.2, and none of the |
344 | 0 | * other 1003.2 regular-expression reviewers noticed it at |
345 | 0 | * all. So an unmatched ) is legal POSIX, at least until |
346 | 0 | * we can get it fixed. |
347 | 0 | */ |
348 | 0 | SETERROR(REG_EPAREN); |
349 | 0 | break; |
350 | 26.3M | #endif |
351 | 1.03M | case '^': |
352 | 1.03M | EMIT(OBOL, 0); |
353 | 1.03M | p->g->iflags |= USEBOL; |
354 | 1.03M | p->g->nbol++; |
355 | 1.03M | wascaret = 1; |
356 | 1.03M | break; |
357 | 991k | case '$': |
358 | 991k | EMIT(OEOL, 0); |
359 | 991k | p->g->iflags |= USEEOL; |
360 | 991k | p->g->neol++; |
361 | 991k | break; |
362 | 0 | case '|': |
363 | 0 | SETERROR(REG_EMPTY); |
364 | 0 | break; |
365 | 4 | case '*': |
366 | 4 | case '+': |
367 | 4 | case '?': |
368 | 4 | SETERROR(REG_BADRPT); |
369 | 4 | break; |
370 | 1.60k | case '.': |
371 | 1.60k | if (p->g->cflags&1.60k REG_NEWLINE1.60k ) |
372 | 0 | nonnewline(p); |
373 | 1.60k | else |
374 | 1.60k | EMIT(OANY, 0); |
375 | 1.60k | break; |
376 | 2.87M | case '[': |
377 | 2.87M | p_bracket(p); |
378 | 2.87M | break; |
379 | 2.70M | case '\\': |
380 | 2.70M | REQUIRE(MORE(), REG_EESCAPE); |
381 | 2.70M | c = GETNEXT(); |
382 | 2.70M | if (c >= '1' && 2.70M c <= '9'45 ) { |
383 | 5 | /* \[0-9] is taken to be a back-reference to a previously specified |
384 | 5 | * matching group. backrefnum will hold the number. The matching |
385 | 5 | * group must exist (i.e. if \4 is found there must have been at |
386 | 5 | * least 4 matching groups specified in the pattern previously). |
387 | 5 | */ |
388 | 5 | backrefnum = c - '0'; |
389 | 5 | if (p->pend[backrefnum] == 05 ) { |
390 | 0 | SETERROR(REG_ESUBREG); |
391 | 0 | break; |
392 | 0 | } |
393 | 5 | |
394 | 5 | /* Make sure everything checks out and emit the sequence |
395 | 5 | * that marks a back-reference to the parse structure. |
396 | 5 | */ |
397 | 5 | assert(backrefnum <= p->g->nsub); |
398 | 5 | EMIT(OBACK_, backrefnum); |
399 | 5 | assert(p->pbegin[backrefnum] != 0); |
400 | 5 | assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN); |
401 | 5 | assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN); |
402 | 5 | (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); |
403 | 5 | EMIT(O_BACK, backrefnum); |
404 | 5 | p->g->backrefs = 1; |
405 | 2.70M | } else { |
406 | 2.70M | /* Other chars are simply themselves when escaped with a backslash. |
407 | 2.70M | */ |
408 | 2.70M | ordinary(p, c); |
409 | 2.70M | } |
410 | 2.70M | break; |
411 | 1.82k | case '{': /* okay as ordinary except if digit follows */ |
412 | 1.82k | REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
413 | 1.82k | /* FALLTHROUGH */ |
414 | 17.5M | default: |
415 | 17.5M | ordinary(p, c); |
416 | 17.5M | break; |
417 | 26.3M | } |
418 | 26.3M | |
419 | 26.3M | if (26.3M !26.3M MORE26.3M ()) |
420 | 1.03M | return; |
421 | 25.2M | c = 25.2M PEEK25.2M (); |
422 | 25.2M | /* we call { a repetition if followed by a digit */ |
423 | 25.2M | if (!( c == '*' || 25.2M c == '+'24.3M || c == '?'24.2M || |
424 | 24.1M | (c == '{' && 24.1M MORE212 () && isdigit((uch)12 PEEK212 ())) )) |
425 | 24.1M | return; /* no repetition, we're done */ |
426 | 1.07M | NEXT1.07M (); |
427 | 1.07M | |
428 | 1.07M | REQUIRE(!wascaret, REG_BADRPT); |
429 | 1.07M | switch (c) { |
430 | 922k | case '*': /* implemented as +? */ |
431 | 922k | /* this case does not require the (y|) trick, noKLUDGE */ |
432 | 922k | INSERT(OPLUS_, pos); |
433 | 922k | ASTERN(O_PLUS, pos); |
434 | 922k | INSERT(OQUEST_, pos); |
435 | 922k | ASTERN(O_QUEST, pos); |
436 | 922k | break; |
437 | 73.3k | case '+': |
438 | 73.3k | INSERT(OPLUS_, pos); |
439 | 73.3k | ASTERN(O_PLUS, pos); |
440 | 73.3k | break; |
441 | 84.2k | case '?': |
442 | 84.2k | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
443 | 84.2k | INSERT(OCH_, pos); /* offset slightly wrong */ |
444 | 84.2k | ASTERN(OOR1, pos); /* this one's right */ |
445 | 84.2k | AHEAD(pos); /* fix the OCH_ */ |
446 | 84.2k | EMIT(OOR2, 0); /* offset very wrong... */ |
447 | 84.2k | AHEAD(THERE()); /* ...so fix it */ |
448 | 84.2k | ASTERN(O_CH, THERETHERE()); |
449 | 84.2k | break; |
450 | 12 | case '{': |
451 | 12 | count = p_count(p); |
452 | 12 | if (EAT12 (',')) { |
453 | 12 | if (isdigit((uch)12 PEEK12 ())) { |
454 | 12 | count2 = p_count(p); |
455 | 12 | REQUIRE(count <= count2, REG_BADBR); |
456 | 12 | } else /* single number with comma */ |
457 | 0 | count2 = 0 INFINITY0 ; |
458 | 12 | } else /* just a single number */ |
459 | 0 | count2 = count; |
460 | 12 | repeat(p, pos, count, count2); |
461 | 12 | if (!12 EAT12 ('}')) { /* error heuristics */ |
462 | 0 | while (MORE0 () && 0 PEEK0 () != '}'0 ) |
463 | 0 | NEXT(); |
464 | 0 | REQUIRE(MORE(), REG_EBRACE); |
465 | 0 | SETERROR(REG_BADBR); |
466 | 0 | } |
467 | 12 | break; |
468 | 1.07M | } |
469 | 1.07M | |
470 | 1.07M | if (1.07M !1.07M MORE1.07M ()) |
471 | 636 | return; |
472 | 1.07M | c = 1.07M PEEK1.07M (); |
473 | 1.07M | if (!( c == '*' || 1.07M c == '+'1.07M || c == '?'1.07M || |
474 | 1.07M | (c == '{' && 1.07M MORE21.82k () && isdigit((uch)1.82k PEEK21.82k ())) ) ) |
475 | 1.07M | return; |
476 | 0 | SETERROR0 (REG_BADRPT); |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | - p_str - string (no metacharacters) "parser" |
481 | | */ |
482 | | static void |
483 | | p_str(struct parse *p) |
484 | 0 | { |
485 | 0 | REQUIRE(MORE(), REG_EMPTY); |
486 | 0 | while (MORE()) |
487 | 0 | ordinary(p, 0 GETNEXT0 ()); |
488 | 0 | } |
489 | | |
490 | | /* |
491 | | - p_bre - BRE parser top level, anchoring and concatenation |
492 | | * Giving end1 as OUT essentially eliminates the end1/end2 check. |
493 | | * |
494 | | * This implementation is a bit of a kludge, in that a trailing $ is first |
495 | | * taken as an ordinary character and then revised to be an anchor. The |
496 | | * only undesirable side effect is that '$' gets included as a character |
497 | | * category in such cases. This is fairly harmless; not worth fixing. |
498 | | * The amount of lookahead needed to avoid this kludge is excessive. |
499 | | */ |
500 | | static void |
501 | | p_bre(struct parse *p, |
502 | | int end1, /* first terminating character */ |
503 | | int end2) /* second terminating character */ |
504 | 0 | { |
505 | 0 | sopno start = HERE(); |
506 | 0 | int first = 1; /* first subexpression? */ |
507 | 0 | int wasdollar = 0; |
508 | 0 |
|
509 | 0 | if (EAT0 ('^')) { |
510 | 0 | EMIT(OBOL, 0); |
511 | 0 | p->g->iflags |= USEBOL; |
512 | 0 | p->g->nbol++; |
513 | 0 | } |
514 | 0 | while (MORE0 () && 0 !0 SEETWO0 (end1, end2)) { |
515 | 0 | wasdollar = p_simp_re(p, first); |
516 | 0 | first = 0; |
517 | 0 | } |
518 | 0 | if (wasdollar0 ) { /* oops, that was a trailing anchor */ |
519 | 0 | DROP(1); |
520 | 0 | EMIT(OEOL, 0); |
521 | 0 | p->g->iflags |= USEEOL; |
522 | 0 | p->g->neol++; |
523 | 0 | } |
524 | 0 |
|
525 | 0 | REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
526 | 0 | } |
527 | | |
528 | | /* |
529 | | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
530 | | */ |
531 | | static int /* was the simple RE an unbackslashed $? */ |
532 | | p_simp_re(struct parse *p, |
533 | | int starordinary) /* is a leading * an ordinary character? */ |
534 | 0 | { |
535 | 0 | int c; |
536 | 0 | int count; |
537 | 0 | int count2; |
538 | 0 | sopno pos; |
539 | 0 | int i; |
540 | 0 | sopno subno; |
541 | 0 | # define BACKSL (1<<CHAR_BIT) |
542 | 0 |
|
543 | 0 | pos = HERE(); /* repetition op, if any, covers from here */ |
544 | 0 |
|
545 | 0 | assert(MORE()); /* caller should have ensured this */ |
546 | 0 | c = GETNEXT(); |
547 | 0 | if (c == '\\'0 ) { |
548 | 0 | REQUIRE(MORE(), REG_EESCAPE); |
549 | 0 | c = BACKSL0 | GETNEXT0 (); |
550 | 0 | } |
551 | 0 | switch (c) { |
552 | 0 | case '.': |
553 | 0 | if (p->g->cflags&0 REG_NEWLINE0 ) |
554 | 0 | nonnewline(p); |
555 | 0 | else |
556 | 0 | EMIT(OANY, 0); |
557 | 0 | break; |
558 | 0 | case '[': |
559 | 0 | p_bracket(p); |
560 | 0 | break; |
561 | 0 | case 0 BACKSL0 |'{': |
562 | 0 | SETERROR(REG_BADRPT); |
563 | 0 | break; |
564 | 0 | case 0 BACKSL0 |'(': |
565 | 0 | p->g->nsub++; |
566 | 0 | subno = p->g->nsub; |
567 | 0 | if (subno < 0 NPAREN0 ) |
568 | 0 | p->pbegin[subno] = 0 HERE0 (); |
569 | 0 | EMIT(OLPAREN, subno); |
570 | 0 | /* the MORE here is an error heuristic */ |
571 | 0 | if (MORE0 () && 0 !0 SEETWO0 ('\\', ')')) |
572 | 0 | p_bre(p, '\\', ')'); |
573 | 0 | if (subno < 0 NPAREN0 ) { |
574 | 0 | p->pend[subno] = HERE(); |
575 | 0 | assert(p->pend[subno] != 0); |
576 | 0 | } |
577 | 0 | EMIT(ORPAREN, subno); |
578 | 0 | REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
579 | 0 | break; |
580 | 0 | case 0 BACKSL0 |')': /* should not get here -- must be user */ |
581 | 0 | case 0 BACKSL0 |'}': |
582 | 0 | SETERROR(REG_EPAREN); |
583 | 0 | break; |
584 | 0 | case 0 BACKSL0 |'1': |
585 | 0 | case 0 BACKSL0 |'2': |
586 | 0 | case 0 BACKSL0 |'3': |
587 | 0 | case 0 BACKSL0 |'4': |
588 | 0 | case 0 BACKSL0 |'5': |
589 | 0 | case 0 BACKSL0 |'6': |
590 | 0 | case 0 BACKSL0 |'7': |
591 | 0 | case 0 BACKSL0 |'8': |
592 | 0 | case 0 BACKSL0 |'9': |
593 | 0 | i = (c&~BACKSL) - '0'; |
594 | 0 | assert(i < NPAREN); |
595 | 0 | if (p->pend[i] != 00 ) { |
596 | 0 | assert(i <= p->g->nsub); |
597 | 0 | EMIT(OBACK_, i); |
598 | 0 | assert(p->pbegin[i] != 0); |
599 | 0 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
600 | 0 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
601 | 0 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
602 | 0 | EMIT(O_BACK, i); |
603 | 0 | } else |
604 | 0 | SETERROR(REG_ESUBREG); |
605 | 0 | p->g->backrefs = 1; |
606 | 0 | break; |
607 | 0 | case '*': |
608 | 0 | REQUIRE(starordinary, REG_BADRPT); |
609 | 0 | /* FALLTHROUGH */ |
610 | 0 | default: |
611 | 0 | ordinary(p, (char)c); |
612 | 0 | break; |
613 | 0 | } |
614 | 0 |
|
615 | 0 | if (0 EAT0 ('*')) { /* implemented as +? */ |
616 | 0 | /* this case does not require the (y|) trick, noKLUDGE */ |
617 | 0 | INSERT(OPLUS_, pos); |
618 | 0 | ASTERN(O_PLUS, pos); |
619 | 0 | INSERT(OQUEST_, pos); |
620 | 0 | ASTERN(O_QUEST, pos); |
621 | 0 | } else if (0 EATTWO0 ('\\', '{')) { |
622 | 0 | count = p_count(p); |
623 | 0 | if (EAT0 (',')) { |
624 | 0 | if (MORE0 () && 0 isdigit((uch)0 PEEK0 ())) { |
625 | 0 | count2 = p_count(p); |
626 | 0 | REQUIRE(count <= count2, REG_BADBR); |
627 | 0 | } else /* single number with comma */ |
628 | 0 | count2 = 0 INFINITY0 ; |
629 | 0 | } else /* just a single number */ |
630 | 0 | count2 = count; |
631 | 0 | repeat(p, pos, count, count2); |
632 | 0 | if (!0 EATTWO0 ('\\', '}')) { /* error heuristics */ |
633 | 0 | while (MORE0 () && 0 !0 SEETWO0 ('\\', '}')) |
634 | 0 | NEXT(); |
635 | 0 | REQUIRE(MORE(), REG_EBRACE); |
636 | 0 | SETERROR(REG_BADBR); |
637 | 0 | } |
638 | 0 | } else if (0 c == '$'0 ) /* $ (but not \$) ends it */ |
639 | 0 | return(1); |
640 | 0 |
|
641 | 0 | return(0); |
642 | 0 | } |
643 | | |
644 | | /* |
645 | | - p_count - parse a repetition count |
646 | | */ |
647 | | static int /* the value */ |
648 | | p_count(struct parse *p) |
649 | 24 | { |
650 | 24 | int count = 0; |
651 | 24 | int ndigits = 0; |
652 | 24 | |
653 | 48 | while (MORE48 () && 48 isdigit((uch)48 PEEK48 ()) && count <= 24 DUPMAX24 ) { |
654 | 24 | count = count*10 + (GETNEXT() - '0'); |
655 | 24 | ndigits++; |
656 | 24 | } |
657 | 24 | |
658 | 24 | REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
659 | 24 | return(count); |
660 | 24 | } |
661 | | |
662 | | /* |
663 | | - p_bracket - parse a bracketed character list |
664 | | * |
665 | | * Note a significant property of this code: if the allocset() did SETERROR, |
666 | | * no set operations are done. |
667 | | */ |
668 | | static void |
669 | | p_bracket(struct parse *p) |
670 | 2.95M | { |
671 | 2.95M | cset *cs; |
672 | 2.95M | int invert = 0; |
673 | 2.95M | |
674 | 2.95M | /* Dept of Truly Sickening Special-Case Kludges */ |
675 | 2.95M | if (p->next + 5 < p->end && 2.95M strncmp(p->next, "[:<:]]", 6) == 02.87M ) { |
676 | 0 | EMIT(OBOW, 0); |
677 | 0 | NEXTn(6); |
678 | 0 | return; |
679 | 0 | } |
680 | 2.95M | if (2.95M p->next + 5 < p->end && 2.95M strncmp(p->next, "[:>:]]", 6) == 02.87M ) { |
681 | 0 | EMIT(OEOW, 0); |
682 | 0 | NEXTn(6); |
683 | 0 | return; |
684 | 0 | } |
685 | 2.95M | |
686 | 2.95M | if (2.95M (cs = allocset(p)) == NULL2.95M ) { |
687 | 0 | /* allocset did set error status in p */ |
688 | 0 | return; |
689 | 0 | } |
690 | 2.95M | |
691 | 2.95M | if (2.95M EAT2.95M ('^')) |
692 | 1.11k | invert++; /* make note to invert set at end */ |
693 | 2.95M | if (EAT(']')) |
694 | 0 | CHadd(cs, ']'); |
695 | 2.95M | else if (2.95M EAT2.95M ('-')) |
696 | 22.3k | CHadd(cs, '-'); |
697 | 11.6M | while (MORE11.6M () && 11.6M PEEK11.6M () != ']'11.6M && !8.67M SEETWO8.67M ('-', ']')) |
698 | 8.67M | p_b_term(p, cs); |
699 | 2.95M | if (EAT('-')) |
700 | 1.82k | CHadd(cs, '-'); |
701 | 2.95M | MUSTEAT(']', REG_EBRACK); |
702 | 2.95M | |
703 | 2.95M | if (p->error != 02.95M ) { /* don't mess things up further */ |
704 | 3 | freeset(p, cs); |
705 | 3 | return; |
706 | 3 | } |
707 | 2.95M | |
708 | 2.95M | if (2.95M p->g->cflags&2.95M REG_ICASE2.95M ) { |
709 | 85.8k | int i; |
710 | 85.8k | int ci; |
711 | 85.8k | |
712 | 22.0M | for (i = p->g->csetsize - 1; i >= 022.0M ; i--21.9M ) |
713 | 21.9M | if (21.9M CHIN21.9M (cs, i) && 21.9M isalpha(i)304k ) { |
714 | 274k | ci = othercase(i); |
715 | 274k | if (ci != i) |
716 | 274k | CHadd(cs, ci); |
717 | 21.9M | } |
718 | 85.8k | if (cs->multis != NULL) |
719 | 0 | mccase(p, cs); |
720 | 85.8k | } |
721 | 2.95M | if (invert2.95M ) { |
722 | 1.11k | int i; |
723 | 1.11k | |
724 | 287k | for (i = p->g->csetsize - 1; i >= 0287k ; i--286k ) |
725 | 286k | if (286k CHIN286k (cs, i)) |
726 | 2.12k | CHsub(cs, i); |
727 | 286k | else |
728 | 284k | CHadd(cs, i); |
729 | 1.11k | if (p->g->cflags&1.11k REG_NEWLINE1.11k ) |
730 | 0 | CHsub(cs, '\n'); |
731 | 1.11k | if (cs->multis != NULL) |
732 | 0 | mcinvert(p, cs); |
733 | 1.11k | } |
734 | 2.95M | |
735 | 2.95M | assert(cs->multis == NULL); /* xxx */ |
736 | 2.95M | |
737 | 2.95M | if (nch(p, cs) == 12.95M ) { /* optimize singleton sets */ |
738 | 0 | ordinary(p, firstch(p, cs)); |
739 | 0 | freeset(p, cs); |
740 | 0 | } else |
741 | 2.95M | EMIT(OANYOF, freezeset(p, cs)); |
742 | 2.95M | } |
743 | | |
744 | | /* |
745 | | - p_b_term - parse one term of a bracketed character list |
746 | | */ |
747 | | static void |
748 | | p_b_term(struct parse *p, cset *cs) |
749 | 8.67M | { |
750 | 8.67M | char c; |
751 | 8.67M | char start, finish; |
752 | 8.67M | int i; |
753 | 8.67M | |
754 | 8.67M | /* classify what we've got */ |
755 | 8.67M | switch ((MORE8.67M ()) ? PEEK8.67M () : '\0'0 ) { |
756 | 15 | case '[': |
757 | 15 | c = (MORE215 ()) ? PEEK215 () : '\0'0 ; |
758 | 15 | break; |
759 | 2 | case '-': |
760 | 2 | SETERROR(REG_ERANGE); |
761 | 2 | return; /* NOTE RETURN */ |
762 | 0 | break; |
763 | 8.67M | default: |
764 | 8.67M | c = '\0'; |
765 | 8.67M | break; |
766 | 8.67M | } |
767 | 8.67M | |
768 | 8.67M | switch (c) { |
769 | 12 | case ':': /* character class */ |
770 | 12 | NEXT2(); |
771 | 12 | REQUIRE(MORE(), REG_EBRACK); |
772 | 12 | c = PEEK(); |
773 | 12 | REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
774 | 12 | p_b_cclass(p, cs); |
775 | 12 | REQUIRE(MORE(), REG_EBRACK); |
776 | 12 | REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
777 | 12 | break; |
778 | 0 | case '=': /* equivalence class */ |
779 | 0 | NEXT2(); |
780 | 0 | REQUIRE(MORE(), REG_EBRACK); |
781 | 0 | c = PEEK(); |
782 | 0 | REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
783 | 0 | p_b_eclass(p, cs); |
784 | 0 | REQUIRE(MORE(), REG_EBRACK); |
785 | 0 | REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
786 | 0 | break; |
787 | 8.67M | default: /* symbol, ordinary character, or range */ |
788 | 8.67M | /* xxx revision needed for multichar stuff */ |
789 | 8.67M | start = p_b_symbol(p); |
790 | 8.67M | if (SEE8.67M ('-') && 8.67M MORE21.86M () && PEEK21.86M () != ']'1.86M ) { |
791 | 1.86M | /* range */ |
792 | 1.86M | NEXT(); |
793 | 1.86M | if (EAT('-')) |
794 | 0 | finish = '-'; |
795 | 1.86M | else |
796 | 1.86M | finish = p_b_symbol(p); |
797 | 1.86M | } else |
798 | 6.80M | finish = start; |
799 | 8.67M | /* xxx what about signed chars here... */ |
800 | 8.67M | REQUIRE(start <= finish, REG_ERANGE); |
801 | 48.4M | for (i = start; i <= finish48.4M ; i++39.7M ) |
802 | 39.7M | CHadd(cs, i); |
803 | 8.67M | break; |
804 | 8.67M | } |
805 | 8.67M | } |
806 | | |
807 | | /* |
808 | | - p_b_cclass - parse a character-class name and deal with it |
809 | | */ |
810 | | static void |
811 | | p_b_cclass(struct parse *p, cset *cs) |
812 | 12 | { |
813 | 12 | char *sp = p->next; |
814 | 12 | struct cclass *cp; |
815 | 12 | size_t len; |
816 | 12 | const char *u; |
817 | 12 | char c; |
818 | 12 | |
819 | 72 | while (MORE72 () && 72 isalpha((uch)72 PEEK72 ())) |
820 | 60 | NEXT(); |
821 | 12 | len = p->next - sp; |
822 | 52 | for (cp = cclasses; cp->name != NULL52 ; cp++40 ) |
823 | 52 | if (52 strncmp(cp->name, sp, len) == 0 && 52 cp->name[len] == '\0'12 ) |
824 | 12 | break; |
825 | 12 | if (cp->name == NULL12 ) { |
826 | 0 | /* oops, didn't find it */ |
827 | 0 | SETERROR(REG_ECTYPE); |
828 | 0 | return; |
829 | 0 | } |
830 | 12 | |
831 | 12 | u = cp->chars; |
832 | 160 | while ((c = *u++) != '\0') |
833 | 148 | CHadd(cs, c); |
834 | 12 | for (u = cp->multis; *u != '\0'12 ; u += strlen(u) + 10 ) |
835 | 0 | MCadd(p, cs, u); |
836 | 12 | } |
837 | | |
838 | | /* |
839 | | - p_b_eclass - parse an equivalence-class name and deal with it |
840 | | * |
841 | | * This implementation is incomplete. xxx |
842 | | */ |
843 | | static void |
844 | | p_b_eclass(struct parse *p, cset *cs) |
845 | 0 | { |
846 | 0 | char c; |
847 | 0 |
|
848 | 0 | c = p_b_coll_elem(p, '='); |
849 | 0 | CHadd(cs, c); |
850 | 0 | } |
851 | | |
852 | | /* |
853 | | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
854 | | */ |
855 | | static char /* value of symbol */ |
856 | | p_b_symbol(struct parse *p) |
857 | 10.5M | { |
858 | 10.5M | char value; |
859 | 10.5M | |
860 | 10.5M | REQUIRE(MORE(), REG_EBRACK); |
861 | 10.5M | if (!10.5M EATTWO10.5M ('[', '.')) |
862 | 10.5M | return(10.5M GETNEXT10.5M ()); |
863 | 0 |
|
864 | 0 | /* collating symbol */ |
865 | 0 | value = p_b_coll_elem(p, '.'); |
866 | 0 | REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
867 | 10.5M | return(value); |
868 | 10.5M | } |
869 | | |
870 | | /* |
871 | | - p_b_coll_elem - parse a collating-element name and look it up |
872 | | */ |
873 | | static char /* value of collating element */ |
874 | | p_b_coll_elem(struct parse *p, |
875 | | int endc) /* name ended by endc,']' */ |
876 | 0 | { |
877 | 0 | char *sp = p->next; |
878 | 0 | struct cname *cp; |
879 | 0 | int len; |
880 | 0 |
|
881 | 0 | while (MORE0 () && 0 !0 SEETWO0 (endc, ']')) |
882 | 0 | NEXT(); |
883 | 0 | if (!0 MORE0 ()) { |
884 | 0 | SETERROR(REG_EBRACK); |
885 | 0 | return(0); |
886 | 0 | } |
887 | 0 | len = p->next - sp; |
888 | 0 | for (cp = cnames; cp->name != NULL0 ; cp++0 ) |
889 | 0 | if (0 strncmp(cp->name, sp, len) == 0 && 0 cp->name[len] == '\0'0 ) |
890 | 0 | return(cp->code); /* known name */ |
891 | 0 | if (0 len == 10 ) |
892 | 0 | return(*sp); /* single character */ |
893 | 0 | SETERROR0 (REG_ECOLLATE); /* neither */ |
894 | 0 | return(0); |
895 | 0 | } |
896 | | |
897 | | /* |
898 | | - othercase - return the case counterpart of an alphabetic |
899 | | */ |
900 | | static char /* if no counterpart, return ch */ |
901 | | othercase(int ch) |
902 | 356k | { |
903 | 356k | ch = (uch)ch; |
904 | 356k | assert(isalpha(ch)); |
905 | 356k | if (isupper(ch)) |
906 | 137k | return ((uch)tolower(ch)); |
907 | 218k | else if (218k islower(ch)218k ) |
908 | 218k | return ((uch)toupper(ch)); |
909 | 218k | else /* peculiar, but could happen */ |
910 | 0 | return(ch); |
911 | 0 | } |
912 | | |
913 | | /* |
914 | | - bothcases - emit a dualcase version of a two-case character |
915 | | * |
916 | | * Boy, is this implementation ever a kludge... |
917 | | */ |
918 | | static void |
919 | | bothcases(struct parse *p, int ch) |
920 | 81.4k | { |
921 | 81.4k | char *oldnext = p->next; |
922 | 81.4k | char *oldend = p->end; |
923 | 81.4k | char bracket[3]; |
924 | 81.4k | |
925 | 81.4k | ch = (uch)ch; |
926 | 81.4k | assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
927 | 81.4k | p->next = bracket; |
928 | 81.4k | p->end = bracket+2; |
929 | 81.4k | bracket[0] = ch; |
930 | 81.4k | bracket[1] = ']'; |
931 | 81.4k | bracket[2] = '\0'; |
932 | 81.4k | p_bracket(p); |
933 | 81.4k | assert(p->next == bracket+2); |
934 | 81.4k | p->next = oldnext; |
935 | 81.4k | p->end = oldend; |
936 | 81.4k | } |
937 | | |
938 | | /* |
939 | | - ordinary - emit an ordinary character |
940 | | */ |
941 | | static void |
942 | | ordinary(struct parse *p, int ch) |
943 | 20.2M | { |
944 | 20.2M | cat_t *cap = p->g->categories; |
945 | 20.2M | |
946 | 20.2M | if ((p->g->cflags&20.2M REG_ICASE20.2M ) && isalpha((uch)ch)108k && othercase(ch) != ch81.4k ) |
947 | 81.4k | bothcases(p, ch); |
948 | 20.1M | else { |
949 | 20.1M | EMIT(OCHAR, (uch)ch); |
950 | 20.1M | if (cap[ch] == 0) |
951 | 11.1M | cap[ch] = p->g->ncategories++; |
952 | 20.1M | } |
953 | 20.2M | } |
954 | | |
955 | | /* |
956 | | - nonnewline - emit REG_NEWLINE version of OANY |
957 | | * |
958 | | * Boy, is this implementation ever a kludge... |
959 | | */ |
960 | | static void |
961 | | nonnewline(struct parse *p) |
962 | 0 | { |
963 | 0 | char *oldnext = p->next; |
964 | 0 | char *oldend = p->end; |
965 | 0 | char bracket[4]; |
966 | 0 |
|
967 | 0 | p->next = bracket; |
968 | 0 | p->end = bracket+3; |
969 | 0 | bracket[0] = '^'; |
970 | 0 | bracket[1] = '\n'; |
971 | 0 | bracket[2] = ']'; |
972 | 0 | bracket[3] = '\0'; |
973 | 0 | p_bracket(p); |
974 | 0 | assert(p->next == bracket+3); |
975 | 0 | p->next = oldnext; |
976 | 0 | p->end = oldend; |
977 | 0 | } |
978 | | |
979 | | /* |
980 | | - repeat - generate code for a bounded repetition, recursively if needed |
981 | | */ |
982 | | static void |
983 | | repeat(struct parse *p, |
984 | | sopno start, /* operand from here to end of strip */ |
985 | | int from, /* repeated from this number */ |
986 | | int to) /* to this number of times (maybe INFINITY) */ |
987 | 24 | { |
988 | 24 | sopno finish = HERE(); |
989 | 24 | # define N 2 |
990 | 24 | # define INF 3 |
991 | 48 | # define REP(f, t) ((48 f24 )*8 + (t36 )) |
992 | 24 | # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
993 | 24 | sopno copy; |
994 | 24 | |
995 | 24 | if (p->error != 0) /* head off possible runaway recursion */ |
996 | 0 | return; |
997 | 24 | |
998 | 24 | assert(from <= to); |
999 | 24 | |
1000 | 24 | switch (REP(MAP(from), MAP(to))) { |
1001 | 0 | case 0 REP0 (0, 0): /* must be user doing this */ |
1002 | 0 | DROP(finish-start); /* drop the operand */ |
1003 | 0 | break; |
1004 | 0 | case 0 REP0 (0, 1): /* as x{1,1}? */ |
1005 | 0 | case 0 REP0 (0, N): /* as x{1,n}? */ |
1006 | 0 | case 0 REP0 (0, INF): /* as x{1,}? */ |
1007 | 0 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1008 | 0 | INSERT(OCH_, start); /* offset is wrong... */ |
1009 | 0 | repeat(p, start+1, 1, to); |
1010 | 0 | ASTERN(OOR1, start); |
1011 | 0 | AHEAD(start); /* ... fix it */ |
1012 | 0 | EMIT(OOR2, 0); |
1013 | 0 | AHEAD(THERE()); |
1014 | 0 | ASTERN(O_CH, THERETHERE()); |
1015 | 0 | break; |
1016 | 12 | case 12 REP12 (1, 1): /* trivial case */ |
1017 | 12 | /* done */ |
1018 | 12 | break; |
1019 | 12 | case 12 REP12 (1, N): /* as x?x{1,n-1} */ |
1020 | 12 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1021 | 12 | INSERT(OCH_, start); |
1022 | 12 | ASTERN(OOR1, start); |
1023 | 12 | AHEAD(start); |
1024 | 12 | EMIT(OOR2, 0); /* offset very wrong... */ |
1025 | 12 | AHEAD(THERE()); /* ...so fix it */ |
1026 | 12 | ASTERN(O_CH, THERETHERE()); |
1027 | 12 | copy = dupl(p, start+1, finish+1); |
1028 | 12 | assert(copy == finish+4); |
1029 | 12 | repeat(p, copy, 1, to-1); |
1030 | 12 | break; |
1031 | 0 | case 0 REP0 (1, INF): /* as x+ */ |
1032 | 0 | INSERT(OPLUS_, start); |
1033 | 0 | ASTERN(O_PLUS, start); |
1034 | 0 | break; |
1035 | 0 | case 0 REP0 (N, N): /* as xx{m-1,n-1} */ |
1036 | 0 | copy = dupl(p, start, finish); |
1037 | 0 | repeat(p, copy, from-1, to-1); |
1038 | 0 | break; |
1039 | 0 | case 0 REP0 (N, INF): /* as xx{n-1,INF} */ |
1040 | 0 | copy = dupl(p, start, finish); |
1041 | 0 | repeat(p, copy, from-1, to); |
1042 | 0 | break; |
1043 | 0 | default: /* "can't happen" */ |
1044 | 0 | SETERROR(REG_ASSERT); /* just in case */ |
1045 | 0 | break; |
1046 | 24 | } |
1047 | 24 | } |
1048 | | |
1049 | | /* |
1050 | | - seterr - set an error condition |
1051 | | */ |
1052 | | static int /* useless but makes type checking happy */ |
1053 | | seterr(struct parse *p, int e) |
1054 | 32.4k | { |
1055 | 32.4k | if (p->error == 0) /* keep earliest error condition */ |
1056 | 32.3k | p->error = e; |
1057 | 32.4k | p->next = nuls; /* try to bring things to a halt */ |
1058 | 32.4k | p->end = nuls; |
1059 | 32.4k | return(0); /* make the return value well-defined */ |
1060 | 32.4k | } |
1061 | | |
1062 | | /* |
1063 | | - allocset - allocate a set of characters for [] |
1064 | | */ |
1065 | | static cset * |
1066 | | allocset(struct parse *p) |
1067 | 2.95M | { |
1068 | 2.95M | int no = p->g->ncsets++; |
1069 | 2.95M | size_t nc; |
1070 | 2.95M | size_t nbytes; |
1071 | 2.95M | cset *cs; |
1072 | 2.95M | size_t css = (size_t)p->g->csetsize; |
1073 | 2.95M | int i; |
1074 | 2.95M | |
1075 | 2.95M | if (no >= p->ncsalloc2.95M ) { /* need another column of space */ |
1076 | 988k | void *ptr; |
1077 | 988k | |
1078 | 988k | p->ncsalloc += CHAR_BIT; |
1079 | 988k | nc = p->ncsalloc; |
1080 | 988k | if (nc > SIZE_MAX / sizeof(cset)) |
1081 | 0 | goto nomem; |
1082 | 988k | assert(nc % CHAR_BIT == 0); |
1083 | 988k | nbytes = nc / CHAR_BIT * css; |
1084 | 988k | |
1085 | 988k | ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); |
1086 | 988k | if (ptr == NULL) |
1087 | 0 | goto nomem; |
1088 | 988k | p->g->sets = ptr; |
1089 | 988k | |
1090 | 988k | ptr = (uch *)realloc((char *)p->g->setbits, nbytes); |
1091 | 988k | if (ptr == NULL) |
1092 | 0 | goto nomem; |
1093 | 988k | p->g->setbits = ptr; |
1094 | 988k | |
1095 | 1.00M | for (i = 0; i < no1.00M ; i++21.1k ) |
1096 | 21.1k | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); |
1097 | 988k | |
1098 | 988k | (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); |
1099 | 988k | } |
1100 | 2.95M | /* XXX should not happen */ |
1101 | 2.95M | if (2.95M p->g->sets == NULL || 2.95M p->g->setbits == NULL2.95M ) |
1102 | 0 | goto nomem; |
1103 | 2.95M | |
1104 | 2.95M | cs = &p->g->sets[no]; |
1105 | 2.95M | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); |
1106 | 2.95M | cs->mask = 1 << ((no) % CHAR_BIT); |
1107 | 2.95M | cs->hash = 0; |
1108 | 2.95M | cs->smultis = 0; |
1109 | 2.95M | cs->multis = NULL; |
1110 | 2.95M | |
1111 | 2.95M | return(cs); |
1112 | 0 | nomem: |
1113 | 0 | free(p->g->sets); |
1114 | 0 | p->g->sets = NULL; |
1115 | 0 | free(p->g->setbits); |
1116 | 0 | p->g->setbits = NULL; |
1117 | 0 |
|
1118 | 0 | SETERROR(REG_ESPACE); |
1119 | 0 | /* caller's responsibility not to do set ops */ |
1120 | 0 | return(NULL); |
1121 | 2.95M | } |
1122 | | |
1123 | | /* |
1124 | | - freeset - free a now-unused set |
1125 | | */ |
1126 | | static void |
1127 | | freeset(struct parse *p, cset *cs) |
1128 | 114k | { |
1129 | 114k | size_t i; |
1130 | 114k | cset *top = &p->g->sets[p->g->ncsets]; |
1131 | 114k | size_t css = (size_t)p->g->csetsize; |
1132 | 114k | |
1133 | 29.4M | for (i = 0; i < css29.4M ; i++29.3M ) |
1134 | 29.3M | CHsub(cs, i); |
1135 | 114k | if (cs == top-1) /* recover only the easy case */ |
1136 | 114k | p->g->ncsets--; |
1137 | 114k | } |
1138 | | |
1139 | | /* |
1140 | | - freezeset - final processing on a set of characters |
1141 | | * |
1142 | | * The main task here is merging identical sets. This is usually a waste |
1143 | | * of time (although the hash code minimizes the overhead), but can win |
1144 | | * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash |
1145 | | * is done using addition rather than xor -- all ASCII [aA] sets xor to |
1146 | | * the same value! |
1147 | | */ |
1148 | | static int /* set number */ |
1149 | | freezeset(struct parse *p, cset *cs) |
1150 | 2.95M | { |
1151 | 2.95M | uch h = cs->hash; |
1152 | 2.95M | size_t i; |
1153 | 2.95M | cset *top = &p->g->sets[p->g->ncsets]; |
1154 | 2.95M | cset *cs2; |
1155 | 2.95M | size_t css = (size_t)p->g->csetsize; |
1156 | 2.95M | |
1157 | 2.95M | /* look for an earlier one which is the same */ |
1158 | 8.94M | for (cs2 = &p->g->sets[0]; cs2 < top8.94M ; cs2++5.99M ) |
1159 | 6.10M | if (6.10M cs2->hash == h && 6.10M cs2 != cs2.95M ) { |
1160 | 114k | /* maybe */ |
1161 | 29.4M | for (i = 0; i < css29.4M ; i++29.3M ) |
1162 | 29.3M | if (29.3M !!29.3M CHIN29.3M (cs2, i) != !!CHIN29.3M (cs, i)) |
1163 | 2 | break; /* no */ |
1164 | 114k | if (i == css) |
1165 | 114k | break; /* yes */ |
1166 | 6.10M | } |
1167 | 2.95M | |
1168 | 2.95M | if (cs2 < top2.95M ) { /* found one */ |
1169 | 114k | freeset(p, cs); |
1170 | 114k | cs = cs2; |
1171 | 114k | } |
1172 | 2.95M | |
1173 | 2.95M | return((int)(cs - p->g->sets)); |
1174 | 2.95M | } |
1175 | | |
1176 | | /* |
1177 | | - firstch - return first character in a set (which must have at least one) |
1178 | | */ |
1179 | | static int /* character; there is no "none" value */ |
1180 | | firstch(struct parse *p, cset *cs) |
1181 | 0 | { |
1182 | 0 | size_t i; |
1183 | 0 | size_t css = (size_t)p->g->csetsize; |
1184 | 0 |
|
1185 | 0 | for (i = 0; i < css0 ; i++0 ) |
1186 | 0 | if (0 CHIN0 (cs, i)) |
1187 | 0 | return((char)i); |
1188 | 0 | assert(never); |
1189 | 0 | return(0); /* arbitrary */ |
1190 | 0 | } |
1191 | | |
1192 | | /* |
1193 | | - nch - number of characters in a set |
1194 | | */ |
1195 | | static int |
1196 | | nch(struct parse *p, cset *cs) |
1197 | 2.95M | { |
1198 | 2.95M | size_t i; |
1199 | 2.95M | size_t css = (size_t)p->g->csetsize; |
1200 | 2.95M | int n = 0; |
1201 | 2.95M | |
1202 | 759M | for (i = 0; i < css759M ; i++756M ) |
1203 | 756M | if (756M CHIN756M (cs, i)) |
1204 | 40.1M | n++; |
1205 | 2.95M | return(n); |
1206 | 2.95M | } |
1207 | | |
1208 | | /* |
1209 | | - mcadd - add a collating element to a cset |
1210 | | */ |
1211 | | static void |
1212 | | mcadd( struct parse *p, cset *cs, const char *cp) |
1213 | 0 | { |
1214 | 0 | size_t oldend = cs->smultis; |
1215 | 0 | void *np; |
1216 | 0 |
|
1217 | 0 | cs->smultis += strlen(cp) + 1; |
1218 | 0 | np = realloc(cs->multis, cs->smultis); |
1219 | 0 | if (np == NULL0 ) { |
1220 | 0 | if (cs->multis) |
1221 | 0 | free(cs->multis); |
1222 | 0 | cs->multis = NULL; |
1223 | 0 | SETERROR(REG_ESPACE); |
1224 | 0 | return; |
1225 | 0 | } |
1226 | 0 | cs->multis = np; |
1227 | 0 |
|
1228 | 0 | llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); |
1229 | 0 | } |
1230 | | |
1231 | | /* |
1232 | | - mcinvert - invert the list of collating elements in a cset |
1233 | | * |
1234 | | * This would have to know the set of possibilities. Implementation |
1235 | | * is deferred. |
1236 | | */ |
1237 | | /* ARGSUSED */ |
1238 | | static void |
1239 | | mcinvert(struct parse *p, cset *cs) |
1240 | 0 | { |
1241 | 0 | assert(cs->multis == NULL); /* xxx */ |
1242 | 0 | } |
1243 | | |
1244 | | /* |
1245 | | - mccase - add case counterparts of the list of collating elements in a cset |
1246 | | * |
1247 | | * This would have to know the set of possibilities. Implementation |
1248 | | * is deferred. |
1249 | | */ |
1250 | | /* ARGSUSED */ |
1251 | | static void |
1252 | | mccase(struct parse *p, cset *cs) |
1253 | 0 | { |
1254 | 0 | assert(cs->multis == NULL); /* xxx */ |
1255 | 0 | } |
1256 | | |
1257 | | /* |
1258 | | - isinsets - is this character in any sets? |
1259 | | */ |
1260 | | static int /* predicate */ |
1261 | | isinsets(struct re_guts *g, int c) |
1262 | 232M | { |
1263 | 232M | uch *col; |
1264 | 232M | int i; |
1265 | 232M | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1266 | 232M | unsigned uc = (uch)c; |
1267 | 232M | |
1268 | 450M | for (i = 0, col = g->setbits; i < ncols450M ; i++, col += g->csetsize217M ) |
1269 | 220M | if (220M col[uc] != 0220M ) |
1270 | 2.83M | return(1); |
1271 | 230M | return(0); |
1272 | 232M | } |
1273 | | |
1274 | | /* |
1275 | | - samesets - are these two characters in exactly the same sets? |
1276 | | */ |
1277 | | static int /* predicate */ |
1278 | | samesets(struct re_guts *g, int c1, int c2) |
1279 | 154M | { |
1280 | 154M | uch *col; |
1281 | 154M | int i; |
1282 | 154M | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1283 | 154M | unsigned uc1 = (uch)c1; |
1284 | 154M | unsigned uc2 = (uch)c2; |
1285 | 154M | |
1286 | 177M | for (i = 0, col = g->setbits; i < ncols177M ; i++, col += g->csetsize22.1M ) |
1287 | 155M | if (155M col[uc1] != col[uc2]155M ) |
1288 | 133M | return(0); |
1289 | 21.7M | return(1); |
1290 | 154M | } |
1291 | | |
1292 | | /* |
1293 | | - categorize - sort out character categories |
1294 | | */ |
1295 | | static void |
1296 | | categorize(struct parse *p, struct re_guts *g) |
1297 | 1.07M | { |
1298 | 1.07M | cat_t *cats = g->categories; |
1299 | 1.07M | int c; |
1300 | 1.07M | int c2; |
1301 | 1.07M | cat_t cat; |
1302 | 1.07M | |
1303 | 1.07M | /* avoid making error situations worse */ |
1304 | 1.07M | if (p->error != 0) |
1305 | 32.3k | return; |
1306 | 1.03M | |
1307 | 266M | for (c = CHAR_MIN; 1.03M c <= CHAR_MAX266M ; c++265M ) |
1308 | 265M | if (265M cats[c] == 0 && 265M isinsets(g, c)232M ) { |
1309 | 2.83M | cat = g->ncategories++; |
1310 | 2.83M | cats[c] = cat; |
1311 | 223M | for (c2 = c+1; c2 <= CHAR_MAX223M ; c2++220M ) |
1312 | 220M | if (220M cats[c2] == 0 && 220M samesets(g, c, c2)154M ) |
1313 | 21.7M | cats[c2] = cat; |
1314 | 265M | } |
1315 | 1.07M | } |
1316 | | |
1317 | | /* |
1318 | | - dupl - emit a duplicate of a bunch of sops |
1319 | | */ |
1320 | | static sopno /* start of duplicate */ |
1321 | | dupl(struct parse *p, |
1322 | | sopno start, /* from here */ |
1323 | | sopno finish) /* to this less one */ |
1324 | 17 | { |
1325 | 17 | sopno ret = HERE(); |
1326 | 17 | sopno len = finish - start; |
1327 | 17 | |
1328 | 17 | assert(finish >= start); |
1329 | 17 | if (len == 0) |
1330 | 0 | return(ret); |
1331 | 17 | enlarge(p, p->ssize + len); /* this many unexpected additions */ |
1332 | 17 | assert(p->ssize >= p->slen + len); |
1333 | 17 | (void) memmove((char *)(p->strip + p->slen), |
1334 | 17 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); |
1335 | 17 | p->slen += len; |
1336 | 17 | return(ret); |
1337 | 17 | } |
1338 | | |
1339 | | /* |
1340 | | - doemit - emit a strip operator |
1341 | | * |
1342 | | * It might seem better to implement this as a macro with a function as |
1343 | | * hard-case backup, but it's just too big and messy unless there are |
1344 | | * some changes to the data structures. Maybe later. |
1345 | | */ |
1346 | | static void |
1347 | | doemit(struct parse *p, sop op, size_t opnd) |
1348 | 38.2M | { |
1349 | 38.2M | /* avoid making error situations worse */ |
1350 | 38.2M | if (p->error != 0) |
1351 | 32.3k | return; |
1352 | 38.2M | |
1353 | 38.2M | /* deal with oversize operands ("can't happen", more or less) */ |
1354 | 38.2M | assert(opnd < 1<<OPSHIFT); |
1355 | 38.2M | |
1356 | 38.2M | /* deal with undersized strip */ |
1357 | 38.2M | if (p->slen >= p->ssize) |
1358 | 2.03k | enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ |
1359 | 38.2M | assert(p->slen < p->ssize); |
1360 | 38.2M | |
1361 | 38.2M | /* finally, it's all reduced to the easy case */ |
1362 | 38.2M | p->strip[p->slen++] = SOP(op, opnd); |
1363 | 38.2M | } |
1364 | | |
1365 | | /* |
1366 | | - doinsert - insert a sop into the strip |
1367 | | */ |
1368 | | static void |
1369 | | doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
1370 | 2.99M | { |
1371 | 2.99M | sopno sn; |
1372 | 2.99M | sop s; |
1373 | 2.99M | int i; |
1374 | 2.99M | |
1375 | 2.99M | /* avoid making error situations worse */ |
1376 | 2.99M | if (p->error != 0) |
1377 | 0 | return; |
1378 | 2.99M | |
1379 | 2.99M | sn = 2.99M HERE2.99M (); |
1380 | 2.99M | EMIT(op, opnd); /* do checks, ensure space */ |
1381 | 2.99M | assert(HERE() == sn+1); |
1382 | 2.99M | s = p->strip[sn]; |
1383 | 2.99M | |
1384 | 2.99M | /* adjust paren pointers */ |
1385 | 2.99M | assert(pos > 0); |
1386 | 29.9M | for (i = 1; i < 29.9M NPAREN29.9M ; i++26.9M ) { |
1387 | 26.9M | if (p->pbegin[i] >= pos26.9M ) { |
1388 | 63.0k | p->pbegin[i]++; |
1389 | 63.0k | } |
1390 | 26.9M | if (p->pend[i] >= pos26.9M ) { |
1391 | 63.0k | p->pend[i]++; |
1392 | 63.0k | } |
1393 | 26.9M | } |
1394 | 2.99M | |
1395 | 2.99M | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
1396 | 2.99M | (HERE()-pos-1)*sizeof(sop)); |
1397 | 2.99M | p->strip[pos] = s; |
1398 | 2.99M | } |
1399 | | |
1400 | | /* |
1401 | | - dofwd - complete a forward reference |
1402 | | */ |
1403 | | static void |
1404 | | dofwd(struct parse *p, sopno pos, sop value) |
1405 | 2.39M | { |
1406 | 2.39M | /* avoid making error situations worse */ |
1407 | 2.39M | if (p->error != 0) |
1408 | 0 | return; |
1409 | 2.39M | |
1410 | 2.39M | assert(value < 1<<OPSHIFT); |
1411 | 2.39M | p->strip[pos] = OP(p->strip[pos]) | value; |
1412 | 2.39M | } |
1413 | | |
1414 | | /* |
1415 | | - enlarge - enlarge the strip |
1416 | | */ |
1417 | | static void |
1418 | | enlarge(struct parse *p, sopno size) |
1419 | 2.05k | { |
1420 | 2.05k | sop *sp; |
1421 | 2.05k | |
1422 | 2.05k | if (p->ssize >= size) |
1423 | 0 | return; |
1424 | 2.05k | |
1425 | 2.05k | if (2.05k (uintptr_t)size > SIZE_MAX / sizeof(sop)2.05k ) { |
1426 | 0 | SETERROR(REG_ESPACE); |
1427 | 0 | return; |
1428 | 0 | } |
1429 | 2.05k | |
1430 | 2.05k | sp = (sop *)realloc(p->strip, size*sizeof(sop)); |
1431 | 2.05k | if (sp == NULL2.05k ) { |
1432 | 0 | SETERROR(REG_ESPACE); |
1433 | 0 | return; |
1434 | 0 | } |
1435 | 2.05k | p->strip = sp; |
1436 | 2.05k | p->ssize = size; |
1437 | 2.05k | } |
1438 | | |
1439 | | /* |
1440 | | - stripsnug - compact the strip |
1441 | | */ |
1442 | | static void |
1443 | | stripsnug(struct parse *p, struct re_guts *g) |
1444 | 1.07M | { |
1445 | 1.07M | g->nstates = p->slen; |
1446 | 1.07M | if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)1.07M ) { |
1447 | 0 | g->strip = p->strip; |
1448 | 0 | SETERROR(REG_ESPACE); |
1449 | 0 | return; |
1450 | 0 | } |
1451 | 1.07M | |
1452 | 1.07M | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); |
1453 | 1.07M | if (g->strip == NULL1.07M ) { |
1454 | 0 | SETERROR(REG_ESPACE); |
1455 | 0 | g->strip = p->strip; |
1456 | 0 | } |
1457 | 1.07M | } |
1458 | | |
1459 | | /* |
1460 | | - findmust - fill in must and mlen with longest mandatory literal string |
1461 | | * |
1462 | | * This algorithm could do fancy things like analyzing the operands of | |
1463 | | * for common subsequences. Someday. This code is simple and finds most |
1464 | | * of the interesting cases. |
1465 | | * |
1466 | | * Note that must and mlen got initialized during setup. |
1467 | | */ |
1468 | | static void |
1469 | | findmust(struct parse *p, struct re_guts *g) |
1470 | 1.07M | { |
1471 | 1.07M | sop *scan; |
1472 | 1.07M | sop *start = 0; /* start initialized in the default case, after that */ |
1473 | 1.07M | sop *newstart = 0; /* newstart was initialized in the OCHAR case */ |
1474 | 1.07M | sopno newlen; |
1475 | 1.07M | sop s; |
1476 | 1.07M | char *cp; |
1477 | 1.07M | sopno i; |
1478 | 1.07M | |
1479 | 1.07M | /* avoid making error situations worse */ |
1480 | 1.07M | if (p->error != 0) |
1481 | 32.3k | return; |
1482 | 1.03M | |
1483 | 1.03M | /* find the longest OCHAR sequence in strip */ |
1484 | 1.03M | newlen = 0; |
1485 | 1.03M | scan = g->strip + 1; |
1486 | 22.3M | do { |
1487 | 22.3M | s = *scan++; |
1488 | 22.3M | switch (OP(s)) { |
1489 | 13.1M | case 13.1M OCHAR13.1M : /* sequence member */ |
1490 | 13.1M | if (newlen == 0) /* new sequence */ |
1491 | 1.97M | newstart = scan - 1; |
1492 | 13.1M | newlen++; |
1493 | 13.1M | break; |
1494 | 2.18M | case 2.18M OPLUS_2.18M : /* things that don't break one */ |
1495 | 2.18M | case 2.18M OLPAREN2.18M : |
1496 | 2.18M | case 2.18M ORPAREN2.18M : |
1497 | 2.18M | break; |
1498 | 1.92M | case 1.92M OQUEST_1.92M : /* things that must be skipped */ |
1499 | 1.92M | case 1.92M OCH_1.92M : |
1500 | 1.92M | scan--; |
1501 | 3.18M | do { |
1502 | 3.18M | scan += OPND(s); |
1503 | 3.18M | s = *scan; |
1504 | 3.18M | /* assert() interferes w debug printouts */ |
1505 | 3.18M | if (OP3.18M (s) != 3.18M O_QUEST3.18M && OP2.29M (s) != 2.29M O_CH2.29M && |
1506 | 3.18M | OP1.26M (s) != 1.26M OOR21.26M ) { |
1507 | 0 | g->iflags |= REGEX_BAD; |
1508 | 0 | return; |
1509 | 0 | } |
1510 | 3.18M | } while (OP1.92M (s) != 3.18M O_QUEST3.18M && OP2.29M (s) != 2.29M O_CH2.29M ); |
1511 | 1.92M | /* fallthrough */ |
1512 | 7.01M | default: /* things that break a sequence */ |
1513 | 7.01M | if (newlen > g->mlen7.01M ) { /* ends one */ |
1514 | 1.01M | start = newstart; |
1515 | 1.01M | g->mlen = newlen; |
1516 | 1.01M | } |
1517 | 2.18M | newlen = 0; |
1518 | 2.18M | break; |
1519 | 22.3M | } |
1520 | 22.3M | } while (OP1.03M (s) != 22.3M OEND22.3M ); |
1521 | 1.03M | |
1522 | 1.03M | if (1.03M g->mlen == 01.03M ) /* there isn't one */ |
1523 | 25.5k | return; |
1524 | 1.01M | |
1525 | 1.01M | /* turn it into a character string */ |
1526 | 1.01M | g->must = malloc((size_t)g->mlen + 1); |
1527 | 1.01M | if (g->must == NULL1.01M ) { /* argh; just forget it */ |
1528 | 0 | g->mlen = 0; |
1529 | 0 | return; |
1530 | 0 | } |
1531 | 1.01M | cp = g->must; |
1532 | 1.01M | scan = start; |
1533 | 12.3M | for (i = g->mlen; i > 012.3M ; i--11.2M ) { |
1534 | 11.3M | while (OP11.3M (s = *scan++) != 11.3M OCHAR11.3M ) |
1535 | 74.2k | continue; |
1536 | 11.2M | assert(cp < g->must + g->mlen); |
1537 | 11.2M | *cp++ = (char)OPND(s); |
1538 | 11.2M | } |
1539 | 1.07M | assert(cp == g->must + g->mlen); |
1540 | 1.07M | *cp++ = '\0'; /* just on general principles */ |
1541 | 1.07M | } |
1542 | | |
1543 | | /* |
1544 | | - pluscount - count + nesting |
1545 | | */ |
1546 | | static sopno /* nesting depth */ |
1547 | | pluscount(struct parse *p, struct re_guts *g) |
1548 | 1.07M | { |
1549 | 1.07M | sop *scan; |
1550 | 1.07M | sop s; |
1551 | 1.07M | sopno plusnest = 0; |
1552 | 1.07M | sopno maxnest = 0; |
1553 | 1.07M | |
1554 | 1.07M | if (p->error != 0) |
1555 | 32.3k | return(0); /* there may not be an OEND */ |
1556 | 1.03M | |
1557 | 1.03M | scan = g->strip + 1; |
1558 | 37.1M | do { |
1559 | 37.1M | s = *scan++; |
1560 | 37.1M | switch (OP(s)) { |
1561 | 995k | case 995k OPLUS_995k : |
1562 | 995k | plusnest++; |
1563 | 995k | break; |
1564 | 995k | case 995k O_PLUS995k : |
1565 | 995k | if (plusnest > maxnest) |
1566 | 912k | maxnest = plusnest; |
1567 | 995k | plusnest--; |
1568 | 995k | break; |
1569 | 37.1M | } |
1570 | 37.1M | } while (OP1.03M (s) != 37.1M OEND37.1M ); |
1571 | 1.03M | if (1.03M plusnest != 01.03M ) |
1572 | 0 | g->iflags |= 0 REGEX_BAD0 ; |
1573 | 1.03M | return(maxnest); |
1574 | 1.07M | } |