Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Support/regcomp.c
Line
Count
Source (jump to first uncovered line)
1
/*-
2
 * This code is derived from OpenBSD's libc/regex, original license follows:
3
 *
4
 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5
 * Copyright (c) 1992, 1993, 1994
6
 *  The Regents of the University of California.  All rights reserved.
7
 *
8
 * This code is derived from software contributed to Berkeley by
9
 * Henry Spencer.
10
 *
11
 * Redistribution and use in source and binary forms, with or without
12
 * modification, are permitted provided that the following conditions
13
 * are met:
14
 * 1. Redistributions of source code must retain the above copyright
15
 *    notice, this list of conditions and the following disclaimer.
16
 * 2. Redistributions in binary form must reproduce the above copyright
17
 *    notice, this list of conditions and the following disclaimer in the
18
 *    documentation and/or other materials provided with the distribution.
19
 * 3. Neither the name of the University nor the names of its contributors
20
 *    may be used to endorse or promote products derived from this software
21
 *    without specific prior written permission.
22
 *
23
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33
 * SUCH DAMAGE.
34
 *
35
 *  @(#)regcomp.c 8.5 (Berkeley) 3/20/94
36
 */
37
38
#include <sys/types.h>
39
#include <stdint.h>
40
#include <stdio.h>
41
#include <string.h>
42
#include <ctype.h>
43
#include <limits.h>
44
#include <stdlib.h>
45
#include "regex_impl.h"
46
47
#include "regutils.h"
48
#include "regex2.h"
49
50
#include "llvm/Config/config.h"
51
52
/* character-class table */
53
static struct cclass {
54
  const char *name;
55
  const char *chars;
56
  const char *multis;
57
} cclasses[] = {
58
  { "alnum",  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
59
0123456789",        ""} ,
60
  { "alpha",  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
61
          ""} ,
62
  { "blank",  " \t",    ""} ,
63
  { "cntrl",  "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
64
\25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
65
  { "digit",  "0123456789", ""} ,
66
  { "graph",  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
67
0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
68
          ""} ,
69
  { "lower",  "abcdefghijklmnopqrstuvwxyz",
70
          ""} ,
71
  { "print",  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
72
0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
73
          ""} ,
74
  { "punct",  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
75
          ""} ,
76
  { "space",  "\t\n\v\f\r ",  ""} ,
77
  { "upper",  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
78
          ""} ,
79
  { "xdigit", "0123456789ABCDEFabcdef",
80
          ""} ,
81
  { NULL,   0,    "" }
82
};
83
84
/* character-name table */
85
static struct cname {
86
  const char *name;
87
  char code;
88
} cnames[] = {
89
  { "NUL",      '\0' },
90
  { "SOH",      '\001' },
91
  { "STX",      '\002' },
92
  { "ETX",      '\003' },
93
  { "EOT",      '\004' },
94
  { "ENQ",      '\005' },
95
  { "ACK",      '\006' },
96
  { "BEL",      '\007' },
97
  { "alert",      '\007' },
98
  { "BS",       '\010' },
99
  { "backspace",      '\b' },
100
  { "HT",       '\011' },
101
  { "tab",      '\t' },
102
  { "LF",       '\012' },
103
  { "newline",      '\n' },
104
  { "VT",       '\013' },
105
  { "vertical-tab",   '\v' },
106
  { "FF",       '\014' },
107
  { "form-feed",      '\f' },
108
  { "CR",       '\015' },
109
  { "carriage-return",    '\r' },
110
  { "SO",       '\016' },
111
  { "SI",       '\017' },
112
  { "DLE",      '\020' },
113
  { "DC1",      '\021' },
114
  { "DC2",      '\022' },
115
  { "DC3",      '\023' },
116
  { "DC4",      '\024' },
117
  { "NAK",      '\025' },
118
  { "SYN",      '\026' },
119
  { "ETB",      '\027' },
120
  { "CAN",      '\030' },
121
  { "EM",       '\031' },
122
  { "SUB",      '\032' },
123
  { "ESC",      '\033' },
124
  { "IS4",      '\034' },
125
  { "FS",       '\034' },
126
  { "IS3",      '\035' },
127
  { "GS",       '\035' },
128
  { "IS2",      '\036' },
129
  { "RS",       '\036' },
130
  { "IS1",      '\037' },
131
  { "US",       '\037' },
132
  { "space",      ' ' },
133
  { "exclamation-mark",   '!' },
134
  { "quotation-mark",   '"' },
135
  { "number-sign",    '#' },
136
  { "dollar-sign",    '$' },
137
  { "percent-sign",   '%' },
138
  { "ampersand",      '&' },
139
  { "apostrophe",     '\'' },
140
  { "left-parenthesis",   '(' },
141
  { "right-parenthesis",    ')' },
142
  { "asterisk",     '*' },
143
  { "plus-sign",      '+' },
144
  { "comma",      ',' },
145
  { "hyphen",     '-' },
146
  { "hyphen-minus",   '-' },
147
  { "period",     '.' },
148
  { "full-stop",      '.' },
149
  { "slash",      '/' },
150
  { "solidus",      '/' },
151
  { "zero",     '0' },
152
  { "one",      '1' },
153
  { "two",      '2' },
154
  { "three",      '3' },
155
  { "four",     '4' },
156
  { "five",     '5' },
157
  { "six",      '6' },
158
  { "seven",      '7' },
159
  { "eight",      '8' },
160
  { "nine",     '9' },
161
  { "colon",      ':' },
162
  { "semicolon",      ';' },
163
  { "less-than-sign",   '<' },
164
  { "equals-sign",    '=' },
165
  { "greater-than-sign",    '>' },
166
  { "question-mark",    '?' },
167
  { "commercial-at",    '@' },
168
  { "left-square-bracket",  '[' },
169
  { "backslash",      '\\' },
170
  { "reverse-solidus",    '\\' },
171
  { "right-square-bracket", ']' },
172
  { "circumflex",     '^' },
173
  { "circumflex-accent",    '^' },
174
  { "underscore",     '_' },
175
  { "low-line",     '_' },
176
  { "grave-accent",   '`' },
177
  { "left-brace",     '{' },
178
  { "left-curly-bracket",   '{' },
179
  { "vertical-line",    '|' },
180
  { "right-brace",    '}' },
181
  { "right-curly-bracket",  '}' },
182
  { "tilde",      '~' },
183
  { "DEL",      '\177' },
184
  { NULL,       0 }
185
};
186
187
/*
188
 * parse structure, passed up and down to avoid global variables and
189
 * other clumsinesses
190
 */
191
struct parse {
192
  char *next;   /* next character in RE */
193
  char *end;    /* end of string (-> NUL normally) */
194
  int error;    /* has an error been seen? */
195
  sop *strip;   /* malloced strip */
196
  sopno ssize;    /* malloced strip size (allocated) */
197
  sopno slen;   /* malloced strip length (used) */
198
  int ncsalloc;   /* number of csets allocated */
199
  struct re_guts *g;
200
24.4M
# define  NPAREN  10  /* we need to remember () 1-9 for back refs */
201
  sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
202
  sopno pend[NPAREN]; /* -> ) ([0] unused) */
203
};
204
205
static void p_ere(struct parse *, int);
206
static void p_ere_exp(struct parse *);
207
static void p_str(struct parse *);
208
static void p_bre(struct parse *, int, int);
209
static int p_simp_re(struct parse *, int);
210
static int p_count(struct parse *);
211
static void p_bracket(struct parse *);
212
static void p_b_term(struct parse *, cset *);
213
static void p_b_cclass(struct parse *, cset *);
214
static void p_b_eclass(struct parse *, cset *);
215
static char p_b_symbol(struct parse *);
216
static char p_b_coll_elem(struct parse *, int);
217
static char othercase(int);
218
static void bothcases(struct parse *, int);
219
static void ordinary(struct parse *, int);
220
static void nonnewline(struct parse *);
221
static void repeat(struct parse *, sopno, int, int);
222
static int seterr(struct parse *, int);
223
static cset *allocset(struct parse *);
224
static void freeset(struct parse *, cset *);
225
static int freezeset(struct parse *, cset *);
226
static int firstch(struct parse *, cset *);
227
static int nch(struct parse *, cset *);
228
static void mcadd(struct parse *, cset *, const char *);
229
static void mcinvert(struct parse *, cset *);
230
static void mccase(struct parse *, cset *);
231
static int isinsets(struct re_guts *, int);
232
static int samesets(struct re_guts *, int, int);
233
static void categorize(struct parse *, struct re_guts *);
234
static sopno dupl(struct parse *, sopno, sopno);
235
static void doemit(struct parse *, sop, size_t);
236
static void doinsert(struct parse *, sop, size_t, sopno);
237
static void dofwd(struct parse *, sopno, sop);
238
static void enlarge(struct parse *, sopno);
239
static void stripsnug(struct parse *, struct re_guts *);
240
static void findmust(struct parse *, struct re_guts *);
241
static sopno pluscount(struct parse *, struct re_guts *);
242
243
static char nuls[10];   /* place to point scanner in event of error */
244
245
/*
246
 * macros for use with parse structure
247
 * BEWARE:  these know that the parse structure is named `p' !!!
248
 */
249
70.7M
#define PEEK()  (*p->next)
250
965k
#define PEEK2() (*(p->next+1))
251
123M
#define MORE()  (p->next < p->end)
252
42.7M
#define MORE2() (p->next+1 < p->end)
253
19.5M
#define SEE(c)  (
MORE14.8M
() &&
PEEK14.1M
() == (c)14.1M
)
254
10.4M
#define SEETWO(a, b)  (MORE() && MORE2() && 
PEEK10.4M
() == (a)10.4M
&&
PEEK25
() == (b)5
)
255
9.40M
#define EAT(c)  ((SEE(c)) ? 
(802k
NEXT802k
(), 1) :
08.60M
)
256
5.69M
#define EATTWO(a, b)  ((SEETWO(a, b)) ? 
(0
NEXT20
(), 1) : 0)
257
2.82M
#define NEXT()  (p->next++)
258
513
#define NEXT2() (p->next += 2)
259
0
#define NEXTn(n)  (p->next += (n))
260
23.9M
#define GETNEXT() (*p->next++)
261
61.0k
#define SETERROR(e) seterr(p, (e))
262
17.4M
#define REQUIRE(co, e)  (void)((
co4.49M
) ||
SETERROR61.0k
(e))
263
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
264
2.24M
#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
265
#define MUSTNOTSEE(c, e)  (REQUIRE(!MORE() || PEEK() != (c), e))
266
23.6M
#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
267
1.54M
#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
268
1.36M
#define AHEAD(pos)    dofwd(p, pos, HERE()-(pos))
269
2.35M
#define ASTERN(sop, pos)  EMIT(sop, HERE()-pos)
270
25.6M
#define HERE()    (p->slen)
271
2.17M
#define THERE()   (p->slen - 1)
272
#define THERETHERE()  (p->slen - 2)
273
0
#define DROP(n) (p->slen -= (n))
274
275
#ifdef  _POSIX2_RE_DUP_MAX
276
24
#define DUPMAX  _POSIX2_RE_DUP_MAX
277
#else
278
#define DUPMAX  255
279
#endif
280
0
#define INFINITY  (DUPMAX + 1)
281
282
#ifndef NDEBUG
283
static int never = 0;   /* for use in asserts; shuts lint up */
284
#else
285
#define never 0   /* some <assert.h>s have bugs too */
286
#endif
287
288
/*
289
 - llvm_regcomp - interface for parser and compilation
290
 */
291
int       /* 0 success, otherwise REG_something */
292
llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
293
689k
{
294
689k
  struct parse pa;
295
689k
  struct re_guts *g;
296
689k
  struct parse *p = &pa;
297
689k
  int i;
298
689k
  size_t len;
299
#ifdef REDEBUG
300
# define  GOODFLAGS(f)  (f)
301
#else
302
689k
# define  GOODFLAGS(f)  ((f)&~REG_DUMP)
303
689k
#endif
304
689k
305
689k
  cflags = GOODFLAGS(cflags);
306
689k
  if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
307
0
    return(REG_INVARG);
308
689k
309
689k
  if (cflags&REG_PEND) {
310
689k
    if (preg->re_endp < pattern)
311
0
      return(REG_INVARG);
312
689k
    len = preg->re_endp - pattern;
313
689k
  } else
314
0
    len = strlen((const char *)pattern);
315
689k
316
689k
  /* do the mallocs early so failure handling is easy */
317
689k
  g = (struct re_guts *)malloc(sizeof(struct re_guts) +
318
689k
              (NC-1)*sizeof(cat_t));
319
689k
  if (g == NULL)
320
689k
    
return(0
REG_ESPACE0
);
321
689k
  p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
322
689k
  p->strip = (sop *)calloc(p->ssize, sizeof(sop));
323
689k
  p->slen = 0;
324
689k
  if (p->strip == NULL) {
325
0
    free((char *)g);
326
0
    return(REG_ESPACE);
327
0
  }
328
689k
329
689k
  /* set things up */
330
689k
  p->g = g;
331
689k
  p->next = (char *)pattern;  /* convenience; we do not modify it */
332
689k
  p->end = p->next + len;
333
689k
  p->error = 0;
334
689k
  p->ncsalloc = 0;
335
7.58M
  for (i = 0; i < NPAREN; 
i++6.89M
) {
336
6.89M
    p->pbegin[i] = 0;
337
6.89M
    p->pend[i] = 0;
338
6.89M
  }
339
689k
  g->csetsize = NC;
340
689k
  g->sets = NULL;
341
689k
  g->setbits = NULL;
342
689k
  g->ncsets = 0;
343
689k
  g->cflags = cflags;
344
689k
  g->iflags = 0;
345
689k
  g->nbol = 0;
346
689k
  g->neol = 0;
347
689k
  g->must = NULL;
348
689k
  g->mlen = 0;
349
689k
  g->nsub = 0;
350
689k
  g->ncategories = 1; /* category 0 is "everything else" */
351
689k
  g->categories = &g->catspace[-(CHAR_MIN)];
352
689k
  (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
353
689k
  g->backrefs = 0;
354
689k
355
689k
  /* do it */
356
689k
  EMIT(OEND, 0);
357
689k
  g->firststate = THERE();
358
689k
  if (cflags&REG_EXTENDED)
359
689k
    p_ere(p, OUT);
360
0
  else if (cflags&REG_NOSPEC)
361
0
    p_str(p);
362
0
  else
363
0
    p_bre(p, OUT, OUT);
364
689k
  EMIT(OEND, 0);
365
689k
  g->laststate = THERE();
366
689k
367
689k
  /* tidy up loose ends and fill things in */
368
689k
  categorize(p, g);
369
689k
  stripsnug(p, g);
370
689k
  findmust(p, g);
371
689k
  g->nplus = pluscount(p, g);
372
689k
  g->magic = MAGIC2;
373
689k
  preg->re_nsub = g->nsub;
374
689k
  preg->re_g = g;
375
689k
  preg->re_magic = MAGIC1;
376
689k
#ifndef REDEBUG
377
689k
  /* not debugging, so can't rely on the assert() in llvm_regexec() */
378
689k
  if (g->iflags&REGEX_BAD)
379
689k
    
SETERROR0
(REG_ASSERT);
380
689k
#endif
381
689k
382
689k
  /* win or lose, we're done */
383
689k
  if (p->error != 0)  /* lose */
384
61.0k
    llvm_regfree(preg);
385
689k
  return(p->error);
386
689k
}
387
388
/*
389
 - p_ere - ERE parser top level, concatenation and alternation
390
 */
391
static void
392
p_ere(struct parse *p, int stop)  /* character this ERE should end at */
393
1.36M
{
394
1.36M
  char c;
395
1.36M
  sopno prevback = 0;
396
1.36M
  sopno prevfwd = 0;
397
1.36M
  sopno conc;
398
1.36M
  int first = 1;    /* is this the first alternative? */
399
1.36M
400
2.16M
  for (;;) {
401
2.16M
    /* do a bunch of concatenated expressions */
402
2.16M
    conc = HERE();
403
19.0M
    while (MORE() && 
(c = 18.3M
PEEK18.3M
()) != '|' &&
c != stop17.5M
)
404
16.8M
      p_ere_exp(p);
405
2.16M
    REQUIRE(HERE() != conc, REG_EMPTY);  /* require nonempty */
406
2.16M
407
2.16M
    if (!EAT('|'))
408
2.16M
      
break1.36M
; /* NOTE BREAK OUT */
409
800k
410
800k
    if (first) {
411
551k
      INSERT(OCH_, conc); /* offset is wrong */
412
551k
      prevfwd = conc;
413
551k
      prevback = conc;
414
551k
      first = 0;
415
551k
    }
416
800k
    ASTERN(OOR1, prevback);
417
800k
    prevback = THERE();
418
800k
    AHEAD(prevfwd);      /* fix previous offset */
419
800k
    prevfwd = HERE();
420
800k
    EMIT(OOR2, 0);      /* offset is very wrong */
421
800k
  }
422
1.36M
423
1.36M
  if (!first) {   /* tail-end fixups */
424
551k
    AHEAD(prevfwd);
425
551k
    ASTERN(O_CH, prevback);
426
551k
  }
427
1.36M
428
1.36M
  assert(!MORE() || SEE(stop));
429
1.36M
}
430
431
/*
432
 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
433
 */
434
static void
435
p_ere_exp(struct parse *p)
436
16.8M
{
437
16.8M
  char c;
438
16.8M
  sopno pos;
439
16.8M
  int count;
440
16.8M
  int count2;
441
16.8M
  int backrefnum;
442
16.8M
  sopno subno;
443
16.8M
  int wascaret = 0;
444
16.8M
445
16.8M
  assert(MORE());   /* caller should have ensured this */
446
16.8M
  c = GETNEXT();
447
16.8M
448
16.8M
  pos = HERE();
449
16.8M
  switch (c) {
450
16.8M
  case '(':
451
678k
    REQUIRE(MORE(), REG_EPAREN);
452
678k
    p->g->nsub++;
453
678k
    subno = p->g->nsub;
454
678k
    if (subno < NPAREN)
455
678k
      
p->pbegin[subno] = 678k
HERE678k
();
456
678k
    EMIT(OLPAREN, subno);
457
678k
    if (!SEE(')'))
458
678k
      p_ere(p, ')');
459
678k
    if (subno < NPAREN) {
460
678k
      p->pend[subno] = HERE();
461
678k
      assert(p->pend[subno] != 0);
462
678k
    }
463
678k
    EMIT(ORPAREN, subno);
464
678k
    MUSTEAT(')', REG_EPAREN);
465
678k
    break;
466
16.8M
#ifndef POSIX_MISTAKE
467
16.8M
  case ')':   /* happens only if no current unmatched ( */
468
0
    /*
469
0
     * You may ask, why the ifndef?  Because I didn't notice
470
0
     * this until slightly too late for 1003.2, and none of the
471
0
     * other 1003.2 regular-expression reviewers noticed it at
472
0
     * all.  So an unmatched ) is legal POSIX, at least until
473
0
     * we can get it fixed.
474
0
     */
475
0
    SETERROR(REG_EPAREN);
476
0
    break;
477
16.8M
#endif
478
16.8M
  case '^':
479
622k
    EMIT(OBOL, 0);
480
622k
    p->g->iflags |= USEBOL;
481
622k
    p->g->nbol++;
482
622k
    wascaret = 1;
483
622k
    break;
484
16.8M
  case '$':
485
545k
    EMIT(OEOL, 0);
486
545k
    p->g->iflags |= USEEOL;
487
545k
    p->g->neol++;
488
545k
    break;
489
16.8M
  case '|':
490
0
    SETERROR(REG_EMPTY);
491
0
    break;
492
16.8M
  case '*':
493
5
  case '+':
494
5
  case '?':
495
5
    SETERROR(REG_BADRPT);
496
5
    break;
497
80.9k
  case '.':
498
80.9k
    if (p->g->cflags&REG_NEWLINE)
499
80.9k
      
nonnewline(p)0
;
500
80.9k
    else
501
80.9k
      EMIT(OANY, 0);
502
80.9k
    break;
503
1.53M
  case '[':
504
1.53M
    p_bracket(p);
505
1.53M
    break;
506
1.39M
  case '\\':
507
1.39M
    REQUIRE(MORE(), REG_EESCAPE);
508
1.39M
    c = GETNEXT();
509
1.39M
    if (c >= '1' && 
c <= '9'50
) {
510
5
      /* \[0-9] is taken to be a back-reference to a previously specified
511
5
       * matching group. backrefnum will hold the number. The matching
512
5
       * group must exist (i.e. if \4 is found there must have been at
513
5
       * least 4 matching groups specified in the pattern previously).
514
5
       */
515
5
      backrefnum = c - '0';
516
5
      if (p->pend[backrefnum] == 0) {
517
0
        SETERROR(REG_ESUBREG);
518
0
        break;
519
0
      }
520
5
521
5
      /* Make sure everything checks out and emit the sequence
522
5
       * that marks a back-reference to the parse structure.
523
5
       */
524
5
      assert(backrefnum <= p->g->nsub);
525
5
      EMIT(OBACK_, backrefnum);
526
5
      assert(p->pbegin[backrefnum] != 0);
527
5
      assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
528
5
      assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
529
5
      (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
530
5
      EMIT(O_BACK, backrefnum);
531
5
      p->g->backrefs = 1;
532
1.39M
    } else {
533
1.39M
      /* Other chars are simply themselves when escaped with a backslash.
534
1.39M
       */
535
1.39M
      ordinary(p, c);
536
1.39M
    }
537
1.39M
    break;
538
1.39M
  case '{':   /* okay as ordinary except if digit follows */
539
0
    REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
540
0
    /* FALLTHROUGH */
541
12.0M
  default:
542
12.0M
    ordinary(p, c);
543
12.0M
    break;
544
16.8M
  }
545
16.8M
546
16.8M
  if (!MORE())
547
16.8M
    
return601k
;
548
16.2M
  c = PEEK();
549
16.2M
  /* we call { a repetition if followed by a digit */
550
16.2M
  if (!( c == '*' || 
c == '+'15.8M
||
c == '?'15.7M
||
551
16.2M
        
(15.7M
c == '{'15.7M
&&
MORE212
() &&
isdigit((uch)12
PEEK212
())) ))
552
15.7M
    return;   /* no repetition, we're done */
553
527k
  NEXT();
554
527k
555
527k
  REQUIRE(!wascaret, REG_BADRPT);
556
527k
  switch (c) {
557
527k
  case '*': /* implemented as +? */
558
467k
    /* this case does not require the (y|) trick, noKLUDGE */
559
467k
    INSERT(OPLUS_, pos);
560
467k
    ASTERN(O_PLUS, pos);
561
467k
    INSERT(OQUEST_, pos);
562
467k
    ASTERN(O_QUEST, pos);
563
467k
    break;
564
527k
  case '+':
565
52.2k
    INSERT(OPLUS_, pos);
566
52.2k
    ASTERN(O_PLUS, pos);
567
52.2k
    break;
568
527k
  case '?':
569
7.69k
    /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
570
7.69k
    INSERT(OCH_, pos);    /* offset slightly wrong */
571
7.69k
    ASTERN(OOR1, pos);    /* this one's right */
572
7.69k
    AHEAD(pos);      /* fix the OCH_ */
573
7.69k
    EMIT(OOR2, 0);      /* offset very wrong... */
574
7.69k
    AHEAD(THERE());      /* ...so fix it */
575
7.69k
    ASTERN(O_CH, THERETHERE());
576
7.69k
    break;
577
527k
  case '{':
578
12
    count = p_count(p);
579
12
    if (EAT(',')) {
580
12
      if (isdigit((uch)PEEK())) {
581
12
        count2 = p_count(p);
582
12
        REQUIRE(count <= count2, REG_BADBR);
583
12
      } else   /* single number with comma */
584
0
        count2 = INFINITY;
585
12
    } else   /* just a single number */
586
0
      count2 = count;
587
12
    repeat(p, pos, count, count2);
588
12
    if (!EAT('}')) { /* error heuristics */
589
0
      while (MORE() && PEEK() != '}')
590
0
        NEXT();
591
0
      REQUIRE(MORE(), REG_EBRACE);
592
0
      SETERROR(REG_BADBR);
593
0
    }
594
12
    break;
595
527k
  }
596
527k
597
527k
  if (!MORE())
598
527k
    
return26.7k
;
599
500k
  c = PEEK();
600
500k
  if (!( c == '*' || c == '+' || c == '?' ||
601
500k
        (c == '{' && 
MORE20
() &&
isdigit((uch)0
PEEK20
())) ) )
602
500k
    return;
603
0
  SETERROR(REG_BADRPT);
604
0
}
605
606
/*
607
 - p_str - string (no metacharacters) "parser"
608
 */
609
static void
610
p_str(struct parse *p)
611
0
{
612
0
  REQUIRE(MORE(), REG_EMPTY);
613
0
  while (MORE())
614
0
    ordinary(p, GETNEXT());
615
0
}
616
617
/*
618
 - p_bre - BRE parser top level, anchoring and concatenation
619
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
620
 *
621
 * This implementation is a bit of a kludge, in that a trailing $ is first
622
 * taken as an ordinary character and then revised to be an anchor.  The
623
 * only undesirable side effect is that '$' gets included as a character
624
 * category in such cases.  This is fairly harmless; not worth fixing.
625
 * The amount of lookahead needed to avoid this kludge is excessive.
626
 */
627
static void
628
p_bre(struct parse *p,
629
    int end1,   /* first terminating character */
630
    int end2)   /* second terminating character */
631
0
{
632
0
  sopno start = HERE();
633
0
  int first = 1;      /* first subexpression? */
634
0
  int wasdollar = 0;
635
0
636
0
  if (EAT('^')) {
637
0
    EMIT(OBOL, 0);
638
0
    p->g->iflags |= USEBOL;
639
0
    p->g->nbol++;
640
0
  }
641
0
  while (MORE() && !SEETWO(end1, end2)) {
642
0
    wasdollar = p_simp_re(p, first);
643
0
    first = 0;
644
0
  }
645
0
  if (wasdollar) { /* oops, that was a trailing anchor */
646
0
    DROP(1);
647
0
    EMIT(OEOL, 0);
648
0
    p->g->iflags |= USEEOL;
649
0
    p->g->neol++;
650
0
  }
651
0
652
0
  REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
653
0
}
654
655
/*
656
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
657
 */
658
static int      /* was the simple RE an unbackslashed $? */
659
p_simp_re(struct parse *p,
660
    int starordinary)   /* is a leading * an ordinary character? */
661
0
{
662
0
  int c;
663
0
  int count;
664
0
  int count2;
665
0
  sopno pos;
666
0
  int i;
667
0
  sopno subno;
668
0
# define  BACKSL  (1<<CHAR_BIT)
669
0
670
0
        pos = HERE(); /* repetition op, if any, covers from here */
671
0
672
0
        assert(MORE()); /* caller should have ensured this */
673
0
        c = GETNEXT();
674
0
  if (c == '\\') {
675
0
    REQUIRE(MORE(), REG_EESCAPE);
676
0
    c = BACKSL | GETNEXT();
677
0
  }
678
0
  switch (c) {
679
0
  case '.':
680
0
    if (p->g->cflags&REG_NEWLINE)
681
0
      nonnewline(p);
682
0
    else
683
0
      EMIT(OANY, 0);
684
0
    break;
685
0
  case '[':
686
0
    p_bracket(p);
687
0
    break;
688
0
  case BACKSL|'{':
689
0
    SETERROR(REG_BADRPT);
690
0
    break;
691
0
  case BACKSL|'(':
692
0
    p->g->nsub++;
693
0
    subno = p->g->nsub;
694
0
    if (subno < NPAREN)
695
0
      p->pbegin[subno] = HERE();
696
0
    EMIT(OLPAREN, subno);
697
0
    /* the MORE here is an error heuristic */
698
0
    if (MORE() && !SEETWO('\\', ')'))
699
0
      p_bre(p, '\\', ')');
700
0
    if (subno < NPAREN) {
701
0
      p->pend[subno] = HERE();
702
0
      assert(p->pend[subno] != 0);
703
0
    }
704
0
    EMIT(ORPAREN, subno);
705
0
    REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
706
0
    break;
707
0
  case BACKSL|')': /* should not get here -- must be user */
708
0
  case BACKSL|'}':
709
0
    SETERROR(REG_EPAREN);
710
0
    break;
711
0
  case BACKSL|'1':
712
0
  case BACKSL|'2':
713
0
  case BACKSL|'3':
714
0
  case BACKSL|'4':
715
0
  case BACKSL|'5':
716
0
  case BACKSL|'6':
717
0
  case BACKSL|'7':
718
0
  case BACKSL|'8':
719
0
  case BACKSL|'9':
720
0
    i = (c&~BACKSL) - '0';
721
0
    assert(i < NPAREN);
722
0
    if (p->pend[i] != 0) {
723
0
      assert(i <= p->g->nsub);
724
0
      EMIT(OBACK_, i);
725
0
      assert(p->pbegin[i] != 0);
726
0
      assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
727
0
      assert(OP(p->strip[p->pend[i]]) == ORPAREN);
728
0
      (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
729
0
      EMIT(O_BACK, i);
730
0
    } else
731
0
      SETERROR(REG_ESUBREG);
732
0
    p->g->backrefs = 1;
733
0
    break;
734
0
  case '*':
735
0
    REQUIRE(starordinary, REG_BADRPT);
736
0
    /* FALLTHROUGH */
737
0
  default:
738
0
    ordinary(p, (char)c);
739
0
    break;
740
0
  }
741
0
742
0
  if (EAT('*')) {   /* implemented as +? */
743
0
    /* this case does not require the (y|) trick, noKLUDGE */
744
0
    INSERT(OPLUS_, pos);
745
0
    ASTERN(O_PLUS, pos);
746
0
    INSERT(OQUEST_, pos);
747
0
    ASTERN(O_QUEST, pos);
748
0
  } else if (EATTWO('\\', '{')) {
749
0
    count = p_count(p);
750
0
    if (EAT(',')) {
751
0
      if (MORE() && isdigit((uch)PEEK())) {
752
0
        count2 = p_count(p);
753
0
        REQUIRE(count <= count2, REG_BADBR);
754
0
      } else    /* single number with comma */
755
0
        count2 = INFINITY;
756
0
    } else    /* just a single number */
757
0
      count2 = count;
758
0
    repeat(p, pos, count, count2);
759
0
    if (!EATTWO('\\', '}')) { /* error heuristics */
760
0
      while (MORE() && !SEETWO('\\', '}'))
761
0
        NEXT();
762
0
      REQUIRE(MORE(), REG_EBRACE);
763
0
      SETERROR(REG_BADBR);
764
0
    }
765
0
  } else if (c == '$') /* $ (but not \$) ends it */
766
0
    return(1);
767
0
768
0
  return(0);
769
0
}
770
771
/*
772
 - p_count - parse a repetition count
773
 */
774
static int      /* the value */
775
p_count(struct parse *p)
776
24
{
777
24
  int count = 0;
778
24
  int ndigits = 0;
779
24
780
48
  while (MORE() && isdigit((uch)PEEK()) && 
count <= 24
DUPMAX24
) {
781
24
    count = count*10 + (GETNEXT() - '0');
782
24
    ndigits++;
783
24
  }
784
24
785
24
  REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
786
24
  return(count);
787
24
}
788
789
/*
790
 - p_bracket - parse a bracketed character list
791
 *
792
 * Note a significant property of this code:  if the allocset() did SETERROR,
793
 * no set operations are done.
794
 */
795
static void
796
p_bracket(struct parse *p)
797
1.56M
{
798
1.56M
  cset *cs;
799
1.56M
  int invert = 0;
800
1.56M
801
1.56M
  /* Dept of Truly Sickening Special-Case Kludges */
802
1.56M
  if (p->next + 5 < p->end && 
strncmp(p->next, "[:<:]]", 6) == 01.50M
) {
803
0
    EMIT(OBOW, 0);
804
0
    NEXTn(6);
805
0
    return;
806
0
  }
807
1.56M
  if (p->next + 5 < p->end && 
strncmp(p->next, "[:>:]]", 6) == 01.50M
) {
808
0
    EMIT(OEOW, 0);
809
0
    NEXTn(6);
810
0
    return;
811
0
  }
812
1.56M
813
1.56M
  if ((cs = allocset(p)) == NULL) {
814
0
    /* allocset did set error status in p */
815
0
    return;
816
0
  }
817
1.56M
818
1.56M
  if (EAT('^'))
819
1.56M
    
invert++1.84k
; /* make note to invert set at end */
820
1.56M
  if (EAT(']'))
821
1.56M
    
CHadd0
(cs, ']');
822
1.56M
  else if (EAT('-'))
823
1.56M
    
CHadd31
(cs, '-');
824
6.29M
  while (MORE() && 
PEEK6.29M
() != ']'6.29M
&&
!4.72M
SEETWO4.72M
('-', ']'))
825
4.72M
    p_b_term(p, cs);
826
1.56M
  if (EAT('-'))
827
1.56M
    
CHadd0
(cs, '-');
828
1.56M
  MUSTEAT(']', REG_EBRACK);
829
1.56M
830
1.56M
  if (p->error != 0) { /* don't mess things up further */
831
4
    freeset(p, cs);
832
4
    return;
833
4
  }
834
1.56M
835
1.56M
  if (p->g->cflags&REG_ICASE) {
836
37.1k
    int i;
837
37.1k
    int ci;
838
37.1k
839
9.54M
    for (i = p->g->csetsize - 1; i >= 0; 
i--9.50M
)
840
9.50M
      if (CHIN(cs, i) && 
isalpha(i)101k
) {
841
95.6k
        ci = othercase(i);
842
95.6k
        if (ci != i)
843
95.6k
          CHadd(cs, ci);
844
95.6k
      }
845
37.1k
    if (cs->multis != NULL)
846
37.1k
      
mccase(p, cs)0
;
847
37.1k
  }
848
1.56M
  if (invert) {
849
1.84k
    int i;
850
1.84k
851
475k
    for (i = p->g->csetsize - 1; i >= 0; 
i--473k
)
852
473k
      if (CHIN(cs, i))
853
473k
        
CHsub4.44k
(cs, i);
854
473k
      else
855
473k
        
CHadd468k
(cs, i);
856
1.84k
    if (p->g->cflags&REG_NEWLINE)
857
1.84k
      
CHsub0
(cs, '\n');
858
1.84k
    if (cs->multis != NULL)
859
1.84k
      
mcinvert(p, cs)0
;
860
1.84k
  }
861
1.56M
862
1.56M
  assert(cs->multis == NULL);   /* xxx */
863
1.56M
864
1.56M
  if (nch(p, cs) == 1) {   /* optimize singleton sets */
865
1
    ordinary(p, firstch(p, cs));
866
1
    freeset(p, cs);
867
1
  } else
868
1.56M
    
EMIT1.56M
(OANYOF, freezeset(p, cs));
869
1.56M
}
870
871
/*
872
 - p_b_term - parse one term of a bracketed character list
873
 */
874
static void
875
p_b_term(struct parse *p, cset *cs)
876
4.72M
{
877
4.72M
  char c;
878
4.72M
  char start, finish;
879
4.72M
  int i;
880
4.72M
881
4.72M
  /* classify what we've got */
882
4.72M
  switch ((MORE()) ? PEEK() : 
'\0'0
) {
883
4.72M
  case '[':
884
517
    c = (MORE2()) ? PEEK2() : 
'\0'0
;
885
517
    break;
886
4.72M
  case '-':
887
1
    SETERROR(REG_ERANGE);
888
1
    return;     /* NOTE RETURN */
889
4.72M
    
break0
;
890
4.72M
  default:
891
4.72M
    c = '\0';
892
4.72M
    break;
893
4.72M
  }
894
4.72M
895
4.72M
  switch (c) {
896
4.72M
  case ':':   /* character class */
897
512
    NEXT2();
898
512
    REQUIRE(MORE(), REG_EBRACK);
899
512
    c = PEEK();
900
512
    REQUIRE(c != '-' && c != ']', REG_ECTYPE);
901
512
    p_b_cclass(p, cs);
902
512
    REQUIRE(MORE(), REG_EBRACK);
903
512
    REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
904
512
    break;
905
4.72M
  case '=':   /* equivalence class */
906
1
    NEXT2();
907
1
    REQUIRE(MORE(), REG_EBRACK);
908
1
    c = PEEK();
909
1
    REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
910
1
    p_b_eclass(p, cs);
911
1
    REQUIRE(MORE(), REG_EBRACK);
912
1
    REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
913
1
    break;
914
4.72M
  default:    /* symbol, ordinary character, or range */
915
4.72M
/* xxx revision needed for multichar stuff */
916
4.72M
    start = p_b_symbol(p);
917
4.72M
    if (SEE('-') && 
MORE2965k
() &&
PEEK2965k
() != ']'965k
) {
918
965k
      /* range */
919
965k
      NEXT();
920
965k
      if (EAT('-'))
921
965k
        
finish = '-'0
;
922
965k
      else
923
965k
        finish = p_b_symbol(p);
924
965k
    } else
925
3.76M
      finish = start;
926
4.72M
/* xxx what about signed chars here... */
927
4.72M
    REQUIRE(start <= finish, REG_ERANGE);
928
25.8M
    for (i = start; i <= finish; 
i++21.1M
)
929
21.1M
      CHadd(cs, i);
930
4.72M
    break;
931
4.72M
  }
932
4.72M
}
933
934
/*
935
 - p_b_cclass - parse a character-class name and deal with it
936
 */
937
static void
938
p_b_cclass(struct parse *p, cset *cs)
939
512
{
940
512
  char *sp = p->next;
941
512
  struct cclass *cp;
942
512
  size_t len;
943
512
  const char *u;
944
512
  char c;
945
512
946
3.07k
  while (MORE() && isalpha((uch)PEEK()))
947
2.56k
    NEXT();
948
512
  len = p->next - sp;
949
4.47k
  for (cp = cclasses; cp->name != NULL; 
cp++3.95k
)
950
4.47k
    if (strncmp(cp->name, sp, len) == 0 && 
cp->name[len] == '\0'512
)
951
512
      break;
952
512
  if (cp->name == NULL) {
953
0
    /* oops, didn't find it */
954
0
    SETERROR(REG_ECTYPE);
955
0
    return;
956
0
  }
957
512
958
512
  u = cp->chars;
959
3.95k
  while ((c = *u++) != '\0')
960
3.44k
    CHadd(cs, c);
961
512
  for (u = cp->multis; *u != '\0'; 
u += strlen(u) + 10
)
962
512
    
MCadd0
(p, cs, u);
963
512
}
964
965
/*
966
 - p_b_eclass - parse an equivalence-class name and deal with it
967
 *
968
 * This implementation is incomplete. xxx
969
 */
970
static void
971
p_b_eclass(struct parse *p, cset *cs)
972
1
{
973
1
  char c;
974
1
975
1
  c = p_b_coll_elem(p, '=');
976
1
  CHadd(cs, c);
977
1
}
978
979
/*
980
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
981
 */
982
static char     /* value of symbol */
983
p_b_symbol(struct parse *p)
984
5.69M
{
985
5.69M
  char value;
986
5.69M
987
5.69M
  REQUIRE(MORE(), REG_EBRACK);
988
5.69M
  if (!EATTWO('[', '.'))
989
5.69M
    return(GETNEXT());
990
0
991
0
  /* collating symbol */
992
0
  value = p_b_coll_elem(p, '.');
993
0
  REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
994
0
  return(value);
995
0
}
996
997
/*
998
 - p_b_coll_elem - parse a collating-element name and look it up
999
 */
1000
static char     /* value of collating element */
1001
p_b_coll_elem(struct parse *p,
1002
    int endc)     /* name ended by endc,']' */
1003
1
{
1004
1
  char *sp = p->next;
1005
1
  struct cname *cp;
1006
1
  size_t len;
1007
1
1008
5
  while (MORE() && !SEETWO(endc, ']'))
1009
4
    NEXT();
1010
1
  if (!MORE()) {
1011
0
    SETERROR(REG_EBRACK);
1012
0
    return(0);
1013
0
  }
1014
1
  len = p->next - sp;
1015
96
  for (cp = cnames; cp->name != NULL; 
cp++95
)
1016
95
    if (strncmp(cp->name, sp, len) == 0 && 
strlen(cp->name) == len1
)
1017
0
      return(cp->code); /* known name */
1018
1
  if (len == 1)
1019
0
    return(*sp); /* single character */
1020
1
  SETERROR(REG_ECOLLATE);     /* neither */
1021
1
  return(0);
1022
1
}
1023
1024
/*
1025
 - othercase - return the case counterpart of an alphabetic
1026
 */
1027
static char     /* if no counterpart, return ch */
1028
othercase(int ch)
1029
132k
{
1030
132k
  ch = (uch)ch;
1031
132k
  assert(isalpha(ch));
1032
132k
  if (isupper(ch))
1033
48.1k
    return ((uch)tolower(ch));
1034
83.9k
  else if (islower(ch))
1035
83.9k
    return ((uch)toupper(ch));
1036
0
  else      /* peculiar, but could happen */
1037
0
    return(ch);
1038
132k
}
1039
1040
/*
1041
 - bothcases - emit a dualcase version of a two-case character
1042
 *
1043
 * Boy, is this implementation ever a kludge...
1044
 */
1045
static void
1046
bothcases(struct parse *p, int ch)
1047
36.3k
{
1048
36.3k
  char *oldnext = p->next;
1049
36.3k
  char *oldend = p->end;
1050
36.3k
  char bracket[3];
1051
36.3k
1052
36.3k
  ch = (uch)ch;
1053
36.3k
  assert(othercase(ch) != ch);  /* p_bracket() would recurse */
1054
36.3k
  p->next = bracket;
1055
36.3k
  p->end = bracket+2;
1056
36.3k
  bracket[0] = ch;
1057
36.3k
  bracket[1] = ']';
1058
36.3k
  bracket[2] = '\0';
1059
36.3k
  p_bracket(p);
1060
36.3k
  assert(p->next == bracket+2);
1061
36.3k
  p->next = oldnext;
1062
36.3k
  p->end = oldend;
1063
36.3k
}
1064
1065
/*
1066
 - ordinary - emit an ordinary character
1067
 */
1068
static void
1069
ordinary(struct parse *p, int ch)
1070
13.4M
{
1071
13.4M
  cat_t *cap = p->g->categories;
1072
13.4M
1073
13.4M
  if ((p->g->cflags&REG_ICASE) && 
isalpha((uch)ch)45.1k
&&
othercase(ch) != ch36.3k
)
1074
36.3k
    bothcases(p, ch);
1075
13.3M
  else {
1076
13.3M
    EMIT(OCHAR, (uch)ch);
1077
13.3M
    if (cap[ch] == 0)
1078
7.23M
      cap[ch] = p->g->ncategories++;
1079
13.3M
  }
1080
13.4M
}
1081
1082
/*
1083
 - nonnewline - emit REG_NEWLINE version of OANY
1084
 *
1085
 * Boy, is this implementation ever a kludge...
1086
 */
1087
static void
1088
nonnewline(struct parse *p)
1089
0
{
1090
0
  char *oldnext = p->next;
1091
0
  char *oldend = p->end;
1092
0
  char bracket[4];
1093
0
1094
0
  p->next = bracket;
1095
0
  p->end = bracket+3;
1096
0
  bracket[0] = '^';
1097
0
  bracket[1] = '\n';
1098
0
  bracket[2] = ']';
1099
0
  bracket[3] = '\0';
1100
0
  p_bracket(p);
1101
0
  assert(p->next == bracket+3);
1102
0
  p->next = oldnext;
1103
0
  p->end = oldend;
1104
0
}
1105
1106
/*
1107
 - repeat - generate code for a bounded repetition, recursively if needed
1108
 */
1109
static void
1110
repeat(struct parse *p,
1111
    sopno start,    /* operand from here to end of strip */
1112
    int from,     /* repeated from this number */
1113
    int to)     /* to this number of times (maybe INFINITY) */
1114
24
{
1115
24
  sopno finish = HERE();
1116
24
# define  N 2
1117
24
# define  INF 3
1118
48
# define  REP(f, t) ((
f24
)*8 + (
t36
))
1119
24
# define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1120
24
  sopno copy;
1121
24
1122
24
  if (p->error != 0)  /* head off possible runaway recursion */
1123
0
    return;
1124
24
1125
24
  assert(from <= to);
1126
24
1127
24
  switch (REP(MAP(from), MAP(to))) {
1128
24
  
case 0
REP0
(0, 0): /* must be user doing this */
1129
0
    DROP(finish-start); /* drop the operand */
1130
0
    break;
1131
24
  
case 0
REP0
(0, 1): /* as x{1,1}? */
1132
0
  case REP(0, N):     /* as x{1,n}? */
1133
0
  case REP(0, INF):   /* as x{1,}? */
1134
0
    /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1135
0
    INSERT(OCH_, start);    /* offset is wrong... */
1136
0
    repeat(p, start+1, 1, to);
1137
0
    ASTERN(OOR1, start);
1138
0
    AHEAD(start);      /* ... fix it */
1139
0
    EMIT(OOR2, 0);
1140
0
    AHEAD(THERE());
1141
0
    ASTERN(O_CH, THERETHERE());
1142
0
    break;
1143
12
  case REP(1, 1):     /* trivial case */
1144
12
    /* done */
1145
12
    break;
1146
12
  case REP(1, N):     /* as x?x{1,n-1} */
1147
12
    /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1148
12
    INSERT(OCH_, start);
1149
12
    ASTERN(OOR1, start);
1150
12
    AHEAD(start);
1151
12
    EMIT(OOR2, 0);      /* offset very wrong... */
1152
12
    AHEAD(THERE());      /* ...so fix it */
1153
12
    ASTERN(O_CH, THERETHERE());
1154
12
    copy = dupl(p, start+1, finish+1);
1155
12
    assert(copy == finish+4);
1156
12
    repeat(p, copy, 1, to-1);
1157
12
    break;
1158
0
  case REP(1, INF):   /* as x+ */
1159
0
    INSERT(OPLUS_, start);
1160
0
    ASTERN(O_PLUS, start);
1161
0
    break;
1162
0
  case REP(N, N):     /* as xx{m-1,n-1} */
1163
0
    copy = dupl(p, start, finish);
1164
0
    repeat(p, copy, from-1, to-1);
1165
0
    break;
1166
0
  case REP(N, INF):   /* as xx{n-1,INF} */
1167
0
    copy = dupl(p, start, finish);
1168
0
    repeat(p, copy, from-1, to);
1169
0
    break;
1170
0
  default:      /* "can't happen" */
1171
0
    SETERROR(REG_ASSERT); /* just in case */
1172
0
    break;
1173
24
  }
1174
24
}
1175
1176
/*
1177
 - seterr - set an error condition
1178
 */
1179
static int      /* useless but makes type checking happy */
1180
seterr(struct parse *p, int e)
1181
61.0k
{
1182
61.0k
  if (p->error == 0)  /* keep earliest error condition */
1183
61.0k
    p->error = e;
1184
61.0k
  p->next = nuls;   /* try to bring things to a halt */
1185
61.0k
  p->end = nuls;
1186
61.0k
  return(0);    /* make the return value well-defined */
1187
61.0k
}
1188
1189
/*
1190
 - allocset - allocate a set of characters for []
1191
 */
1192
static cset *
1193
allocset(struct parse *p)
1194
1.56M
{
1195
1.56M
  int no = p->g->ncsets++;
1196
1.56M
  size_t nc;
1197
1.56M
  size_t nbytes;
1198
1.56M
  cset *cs;
1199
1.56M
  size_t css = (size_t)p->g->csetsize;
1200
1.56M
  int i;
1201
1.56M
1202
1.56M
  if (no >= p->ncsalloc) { /* need another column of space */
1203
567k
    void *ptr;
1204
567k
1205
567k
    p->ncsalloc += CHAR_BIT;
1206
567k
    nc = p->ncsalloc;
1207
567k
    if (nc > SIZE_MAX / sizeof(cset))
1208
0
      goto nomem;
1209
567k
    assert(nc % CHAR_BIT == 0);
1210
567k
    nbytes = nc / CHAR_BIT * css;
1211
567k
1212
567k
    ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1213
567k
    if (ptr == NULL)
1214
567k
      
goto nomem0
;
1215
567k
    p->g->sets = ptr;
1216
567k
1217
567k
    ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1218
567k
    if (ptr == NULL)
1219
567k
      
goto nomem0
;
1220
567k
    p->g->setbits = ptr;
1221
567k
1222
576k
    for (i = 0; i < no; 
i++8.93k
)
1223
8.93k
      p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1224
567k
1225
567k
    (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1226
567k
  }
1227
1.56M
  /* XXX should not happen */
1228
1.56M
  if (p->g->sets == NULL || p->g->setbits == NULL)
1229
1.56M
    
goto nomem0
;
1230
1.56M
1231
1.56M
  cs = &p->g->sets[no];
1232
1.56M
  cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1233
1.56M
  cs->mask = 1 << ((no) % CHAR_BIT);
1234
1.56M
  cs->hash = 0;
1235
1.56M
  cs->smultis = 0;
1236
1.56M
  cs->multis = NULL;
1237
1.56M
1238
1.56M
  return(cs);
1239
0
nomem:
1240
0
  free(p->g->sets);
1241
0
  p->g->sets = NULL;
1242
0
  free(p->g->setbits);
1243
0
  p->g->setbits = NULL;
1244
0
1245
0
  SETERROR(REG_ESPACE);
1246
0
  /* caller's responsibility not to do set ops */
1247
0
  return(NULL);
1248
1.56M
}
1249
1250
/*
1251
 - freeset - free a now-unused set
1252
 */
1253
static void
1254
freeset(struct parse *p, cset *cs)
1255
20.2k
{
1256
20.2k
  size_t i;
1257
20.2k
  cset *top = &p->g->sets[p->g->ncsets];
1258
20.2k
  size_t css = (size_t)p->g->csetsize;
1259
20.2k
1260
5.20M
  for (i = 0; i < css; 
i++5.18M
)
1261
5.18M
    CHsub(cs, i);
1262
20.2k
  if (cs == top-1) /* recover only the easy case */
1263
20.2k
    p->g->ncsets--;
1264
20.2k
}
1265
1266
/*
1267
 - freezeset - final processing on a set of characters
1268
 *
1269
 * The main task here is merging identical sets.  This is usually a waste
1270
 * of time (although the hash code minimizes the overhead), but can win
1271
 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1272
 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1273
 * the same value!
1274
 */
1275
static int      /* set number */
1276
freezeset(struct parse *p, cset *cs)
1277
1.56M
{
1278
1.56M
  uch h = cs->hash;
1279
1.56M
  size_t i;
1280
1.56M
  cset *top = &p->g->sets[p->g->ncsets];
1281
1.56M
  cset *cs2;
1282
1.56M
  size_t css = (size_t)p->g->csetsize;
1283
1.56M
1284
1.56M
  /* look for an earlier one which is the same */
1285
4.71M
  for (cs2 = &p->g->sets[0]; cs2 < top; 
cs2++3.14M
)
1286
3.16M
    if (cs2->hash == h && 
cs2 != cs1.56M
) {
1287
20.2k
      /* maybe */
1288
5.20M
      for (i = 0; i < css; 
i++5.18M
)
1289
5.18M
        if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1290
5.18M
          
break2
; /* no */
1291
20.2k
      if (i == css)
1292
20.2k
        break;     /* yes */
1293
20.2k
    }
1294
1.56M
1295
1.56M
  if (cs2 < top) { /* found one */
1296
20.2k
    freeset(p, cs);
1297
20.2k
    cs = cs2;
1298
20.2k
  }
1299
1.56M
1300
1.56M
  return((int)(cs - p->g->sets));
1301
1.56M
}
1302
1303
/*
1304
 - firstch - return first character in a set (which must have at least one)
1305
 */
1306
static int      /* character; there is no "none" value */
1307
firstch(struct parse *p, cset *cs)
1308
1
{
1309
1
  size_t i;
1310
1
  size_t css = (size_t)p->g->csetsize;
1311
1
1312
42
  for (i = 0; i < css; 
i++41
)
1313
42
    if (CHIN(cs, i))
1314
42
      
return((char)i)1
;
1315
1
  assert(never);
1316
0
  return(0);   /* arbitrary */
1317
1
}
1318
1319
/*
1320
 - nch - number of characters in a set
1321
 */
1322
static int
1323
nch(struct parse *p, cset *cs)
1324
1.56M
{
1325
1.56M
  size_t i;
1326
1.56M
  size_t css = (size_t)p->g->csetsize;
1327
1.56M
  int n = 0;
1328
1.56M
1329
403M
  for (i = 0; i < css; 
i++401M
)
1330
401M
    if (CHIN(cs, i))
1331
401M
      
n++21.6M
;
1332
1.56M
  return(n);
1333
1.56M
}
1334
1335
/*
1336
 - mcadd - add a collating element to a cset
1337
 */
1338
static void
1339
mcadd( struct parse *p, cset *cs, const char *cp)
1340
0
{
1341
0
  size_t oldend = cs->smultis;
1342
0
  void *np;
1343
0
1344
0
  cs->smultis += strlen(cp) + 1;
1345
0
  np = realloc(cs->multis, cs->smultis);
1346
0
  if (np == NULL) {
1347
0
    if (cs->multis)
1348
0
      free(cs->multis);
1349
0
    cs->multis = NULL;
1350
0
    SETERROR(REG_ESPACE);
1351
0
    return;
1352
0
  }
1353
0
  cs->multis = np;
1354
0
1355
0
  llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1356
0
}
1357
1358
/*
1359
 - mcinvert - invert the list of collating elements in a cset
1360
 *
1361
 * This would have to know the set of possibilities.  Implementation
1362
 * is deferred.
1363
 */
1364
/* ARGSUSED */
1365
static void
1366
mcinvert(struct parse *p, cset *cs)
1367
0
{
1368
0
  assert(cs->multis == NULL); /* xxx */
1369
0
}
1370
1371
/*
1372
 - mccase - add case counterparts of the list of collating elements in a cset
1373
 *
1374
 * This would have to know the set of possibilities.  Implementation
1375
 * is deferred.
1376
 */
1377
/* ARGSUSED */
1378
static void
1379
mccase(struct parse *p, cset *cs)
1380
0
{
1381
0
  assert(cs->multis == NULL); /* xxx */
1382
0
}
1383
1384
/*
1385
 - isinsets - is this character in any sets?
1386
 */
1387
static int      /* predicate */
1388
isinsets(struct re_guts *g, int c)
1389
141M
{
1390
141M
  uch *col;
1391
141M
  int i;
1392
141M
  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1393
141M
  unsigned uc = (uch)c;
1394
141M
1395
266M
  for (i = 0, col = g->setbits; i < ncols; 
i++, col += g->csetsize125M
)
1396
126M
    if (col[uc] != 0)
1397
1.54M
      return(1);
1398
141M
  
return(0)140M
;
1399
141M
}
1400
1401
/*
1402
 - samesets - are these two characters in exactly the same sets?
1403
 */
1404
static int      /* predicate */
1405
samesets(struct re_guts *g, int c1, int c2)
1406
82.5M
{
1407
82.5M
  uch *col;
1408
82.5M
  int i;
1409
82.5M
  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1410
82.5M
  unsigned uc1 = (uch)c1;
1411
82.5M
  unsigned uc2 = (uch)c2;
1412
82.5M
1413
94.5M
  for (i = 0, col = g->setbits; i < ncols; 
i++, col += g->csetsize11.9M
)
1414
82.6M
    if (col[uc1] != col[uc2])
1415
70.6M
      return(0);
1416
82.5M
  
return(1)11.8M
;
1417
82.5M
}
1418
1419
/*
1420
 - categorize - sort out character categories
1421
 */
1422
static void
1423
categorize(struct parse *p, struct re_guts *g)
1424
689k
{
1425
689k
  cat_t *cats = g->categories;
1426
689k
  int c;
1427
689k
  int c2;
1428
689k
  cat_t cat;
1429
689k
1430
689k
  /* avoid making error situations worse */
1431
689k
  if (p->error != 0)
1432
61.0k
    return;
1433
628k
1434
161M
  
for (c = CHAR_MIN; 628k
c <= CHAR_MAX;
c++160M
)
1435
160M
    if (cats[c] == 0 && 
isinsets(g, c)141M
) {
1436
1.54M
      cat = g->ncategories++;
1437
1.54M
      cats[c] = cat;
1438
119M
      for (c2 = c+1; c2 <= CHAR_MAX; 
c2++118M
)
1439
118M
        if (cats[c2] == 0 && 
samesets(g, c, c2)82.5M
)
1440
11.8M
          cats[c2] = cat;
1441
1.54M
    }
1442
628k
}
1443
1444
/*
1445
 - dupl - emit a duplicate of a bunch of sops
1446
 */
1447
static sopno      /* start of duplicate */
1448
dupl(struct parse *p,
1449
    sopno start,    /* from here */
1450
    sopno finish)   /* to this less one */
1451
17
{
1452
17
  sopno ret = HERE();
1453
17
  sopno len = finish - start;
1454
17
1455
17
  assert(finish >= start);
1456
17
  if (len == 0)
1457
0
    return(ret);
1458
17
  enlarge(p, p->ssize + len); /* this many unexpected additions */
1459
17
  assert(p->ssize >= p->slen + len);
1460
17
  (void) memmove((char *)(p->strip + p->slen),
1461
17
    (char *)(p->strip + start), (size_t)len*sizeof(sop));
1462
17
  p->slen += len;
1463
17
  return(ret);
1464
17
}
1465
1466
/*
1467
 - doemit - emit a strip operator
1468
 *
1469
 * It might seem better to implement this as a macro with a function as
1470
 * hard-case backup, but it's just too big and messy unless there are
1471
 * some changes to the data structures.  Maybe later.
1472
 */
1473
static void
1474
doemit(struct parse *p, sop op, size_t opnd)
1475
23.6M
{
1476
23.6M
  /* avoid making error situations worse */
1477
23.6M
  if (p->error != 0)
1478
61.0k
    return;
1479
23.6M
1480
23.6M
  /* deal with oversize operands ("can't happen", more or less) */
1481
23.6M
  assert(opnd < 1<<OPSHIFT);
1482
23.6M
1483
23.6M
  /* deal with undersized strip */
1484
23.6M
  if (p->slen >= p->ssize)
1485
3.96k
    enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1486
23.6M
  assert(p->slen < p->ssize);
1487
23.6M
1488
23.6M
  /* finally, it's all reduced to the easy case */
1489
23.6M
  p->strip[p->slen++] = SOP(op, opnd);
1490
23.6M
}
1491
1492
/*
1493
 - doinsert - insert a sop into the strip
1494
 */
1495
static void
1496
doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1497
1.54M
{
1498
1.54M
  sopno sn;
1499
1.54M
  sop s;
1500
1.54M
  int i;
1501
1.54M
1502
1.54M
  /* avoid making error situations worse */
1503
1.54M
  if (p->error != 0)
1504
0
    return;
1505
1.54M
1506
1.54M
  sn = HERE();
1507
1.54M
  EMIT(op, opnd);   /* do checks, ensure space */
1508
1.54M
  assert(HERE() == sn+1);
1509
1.54M
  s = p->strip[sn];
1510
1.54M
1511
1.54M
  /* adjust paren pointers */
1512
1.54M
  assert(pos > 0);
1513
15.4M
  for (i = 1; i < NPAREN; 
i++13.9M
) {
1514
13.9M
    if (p->pbegin[i] >= pos) {
1515
6.22k
      p->pbegin[i]++;
1516
6.22k
    }
1517
13.9M
    if (p->pend[i] >= pos) {
1518
6.22k
      p->pend[i]++;
1519
6.22k
    }
1520
13.9M
  }
1521
1.54M
1522
1.54M
  memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1523
1.54M
            (HERE()-pos-1)*sizeof(sop));
1524
1.54M
  p->strip[pos] = s;
1525
1.54M
}
1526
1527
/*
1528
 - dofwd - complete a forward reference
1529
 */
1530
static void
1531
dofwd(struct parse *p, sopno pos, sop value)
1532
1.36M
{
1533
1.36M
  /* avoid making error situations worse */
1534
1.36M
  if (p->error != 0)
1535
0
    return;
1536
1.36M
1537
1.36M
  assert(value < 1<<OPSHIFT);
1538
1.36M
  p->strip[pos] = OP(p->strip[pos]) | value;
1539
1.36M
}
1540
1541
/*
1542
 - enlarge - enlarge the strip
1543
 */
1544
static void
1545
enlarge(struct parse *p, sopno size)
1546
3.98k
{
1547
3.98k
  sop *sp;
1548
3.98k
1549
3.98k
  if (p->ssize >= size)
1550
0
    return;
1551
3.98k
1552
3.98k
  if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1553
0
    SETERROR(REG_ESPACE);
1554
0
    return;
1555
0
  }
1556
3.98k
1557
3.98k
  sp = (sop *)realloc(p->strip, size*sizeof(sop));
1558
3.98k
  if (sp == NULL) {
1559
0
    SETERROR(REG_ESPACE);
1560
0
    return;
1561
0
  }
1562
3.98k
  p->strip = sp;
1563
3.98k
  p->ssize = size;
1564
3.98k
}
1565
1566
/*
1567
 - stripsnug - compact the strip
1568
 */
1569
static void
1570
stripsnug(struct parse *p, struct re_guts *g)
1571
689k
{
1572
689k
  g->nstates = p->slen;
1573
689k
  if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1574
0
    g->strip = p->strip;
1575
0
    SETERROR(REG_ESPACE);
1576
0
    return;
1577
0
  }
1578
689k
1579
689k
  g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1580
689k
  if (g->strip == NULL) {
1581
0
    SETERROR(REG_ESPACE);
1582
0
    g->strip = p->strip;
1583
0
  }
1584
689k
}
1585
1586
/*
1587
 - findmust - fill in must and mlen with longest mandatory literal string
1588
 *
1589
 * This algorithm could do fancy things like analyzing the operands of |
1590
 * for common subsequences.  Someday.  This code is simple and finds most
1591
 * of the interesting cases.
1592
 *
1593
 * Note that must and mlen got initialized during setup.
1594
 */
1595
static void
1596
findmust(struct parse *p, struct re_guts *g)
1597
689k
{
1598
689k
  sop *scan;
1599
689k
  sop *start = 0; /* start initialized in the default case, after that */
1600
689k
  sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1601
689k
  sopno newlen;
1602
689k
  sop s;
1603
689k
  char *cp;
1604
689k
  sopno i;
1605
689k
1606
689k
  /* avoid making error situations worse */
1607
689k
  if (p->error != 0)
1608
61.0k
    return;
1609
628k
1610
628k
  /* find the longest OCHAR sequence in strip */
1611
628k
  newlen = 0;
1612
628k
  scan = g->strip + 1;
1613
13.4M
  do {
1614
13.4M
    s = *scan++;
1615
13.4M
    switch (OP(s)) {
1616
13.4M
    
case 7.95M
OCHAR7.95M
: /* sequence member */
1617
7.95M
      if (newlen == 0)    /* new sequence */
1618
1.23M
        newstart = scan - 1;
1619
7.95M
      newlen++;
1620
7.95M
      break;
1621
13.4M
    
case 1.39M
OPLUS_1.39M
: /* things that don't break one */
1622
1.39M
    case OLPAREN:
1623
1.39M
    case ORPAREN:
1624
1.39M
      break;
1625
1.39M
    
case 1.02M
OQUEST_1.02M
: /* things that must be skipped */
1626
1.02M
    case OCH_:
1627
1.02M
      scan--;
1628
1.82M
      do {
1629
1.82M
        scan += OPND(s);
1630
1.82M
        s = *scan;
1631
1.82M
        /* assert() interferes w debug printouts */
1632
1.82M
        if (OP(s) != O_QUEST && 
OP1.35M
(s) != 1.35M
O_CH1.35M
&&
1633
1.82M
              
OP802k
(s) != 802k
OOR2802k
) {
1634
0
          g->iflags |= REGEX_BAD;
1635
0
          return;
1636
0
        }
1637
1.82M
      } while (
OP1.02M
(s) != O_QUEST &&
OP1.35M
(s) != 1.35M
O_CH1.35M
);
1638
1.02M
      /* fallthrough */
1639
4.12M
    default:    /* things that break a sequence */
1640
4.12M
      if (newlen > g->mlen) {   /* ends one */
1641
619k
        start = newstart;
1642
619k
        g->mlen = newlen;
1643
619k
      }
1644
4.12M
      newlen = 0;
1645
4.12M
      break;
1646
13.4M
    }
1647
13.4M
  } while (
OP628k
(s) != OEND);
1648
628k
1649
628k
  if (g->mlen == 0)    /* there isn't one */
1650
10.7k
    return;
1651
617k
1652
617k
  /* turn it into a character string */
1653
617k
  g->must = malloc((size_t)g->mlen + 1);
1654
617k
  if (g->must == NULL) {   /* argh; just forget it */
1655
0
    g->mlen = 0;
1656
0
    return;
1657
0
  }
1658
617k
  cp = g->must;
1659
617k
  scan = start;
1660
7.22M
  for (i = g->mlen; i > 0; 
i--6.60M
) {
1661
6.68M
    while (OP(s = *scan++) != OCHAR)
1662
6.60M
      
continue77.4k
;
1663
6.60M
    assert(cp < g->must + g->mlen);
1664
6.60M
    *cp++ = (char)OPND(s);
1665
6.60M
  }
1666
617k
  assert(cp == g->must + g->mlen);
1667
617k
  *cp++ = '\0';   /* just on general principles */
1668
617k
}
1669
1670
/*
1671
 - pluscount - count + nesting
1672
 */
1673
static sopno      /* nesting depth */
1674
pluscount(struct parse *p, struct re_guts *g)
1675
689k
{
1676
689k
  sop *scan;
1677
689k
  sop s;
1678
689k
  sopno plusnest = 0;
1679
689k
  sopno maxnest = 0;
1680
689k
1681
689k
  if (p->error != 0)
1682
61.0k
    return(0); /* there may not be an OEND */
1683
628k
1684
628k
  scan = g->strip + 1;
1685
22.9M
  do {
1686
22.9M
    s = *scan++;
1687
22.9M
    switch (OP(s)) {
1688
22.9M
    
case 520k
OPLUS_520k
:
1689
520k
      plusnest++;
1690
520k
      break;
1691
22.9M
    
case 520k
O_PLUS520k
:
1692
520k
      if (plusnest > maxnest)
1693
489k
        maxnest = plusnest;
1694
520k
      plusnest--;
1695
520k
      break;
1696
22.9M
    }
1697
22.9M
  } while (
OP628k
(s) != OEND);
1698
628k
  if (plusnest != 0)
1699
0
    g->iflags |= REGEX_BAD;
1700
628k
  return(maxnest);
1701
628k
}