NEURON
regexp.cpp
Go to the documentation of this file.
1 #ifdef HAVE_CONFIG_H
2 #include <../../nrnconf.h>
3 #endif
4 /*
5  * Copyright (c) 1987, 1988, 1989, 1990, 1991 Stanford University
6  * Copyright (c) 1991 Silicon Graphics, Inc.
7  *
8  * Permission to use, copy, modify, distribute, and sell this software and
9  * its documentation for any purpose is hereby granted without fee, provided
10  * that (i) the above copyright notices and this permission notice appear in
11  * all copies of the software and related documentation, and (ii) the names of
12  * Stanford and Silicon Graphics may not be used in any advertising or
13  * publicity relating to the software without the specific, prior written
14  * permission of Stanford and Silicon Graphics.
15  *
16  * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
17  * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
18  * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * IN NO EVENT SHALL STANFORD OR SILICON GRAPHICS BE LIABLE FOR
21  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
22  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
23  * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
24  * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
25  * OF THIS SOFTWARE.
26  */
27 
28 /*
29  * Regexp - regular expression searching
30  */
31 
32 #include <InterViews/regexp.h>
33 #include <ivstream.h>
34 #include <string.h>
35 
36 /*
37  * This version is based on the Henry Spencers public domain reimplementation
38  * of the regular expression matching subroutines. They are included as
39  * static subroutines after the externally accessible routines.
40  */
41 
42 /*
43  * Forward declarations for regcomp()'s friends.
44  */
45 static regexp* regcomp(const char* exp);
46 static char* reg(int paren, int* flagp);
47 static char* regbranch(int* flagp);
48 static char* regpiece(int* flagp);
49 static char* regatom(int* flagp);
50 static char* regnode(char op);
51 static char* regnext(char* p);
52 static void regc(char b);
53 static void reginsert(char op, char* opnd);
54 static void regtail(char* p, char* val);
55 static void regoptail(char* p, char* val);
56 static void regerror(const char* s);
57 static int regexec(regexp* prog, char* string);
58 static int regtry(regexp* prog, char* string);
59 static int regmatch(char* prog);
60 static int regrepeat(char* p);
61 
62 
63 inline char *
64 FindNewline(char* s) {
65  return strchr(s, '\n');
66 }
67 
68 inline char *
69 NextLine(char* s) {
70  char* newstart;
71 
72  if ((newstart = FindNewline(s)) != nil)
73  newstart++;
74  return newstart;
75 }
76 
77 Regexp::Regexp (const char* pat) {
78  int length = strlen(pat);
79  pattern_ = new char[length+1];
80  strncpy(pattern_, pat, length);
81  pattern_[length] = '\0';
83  if (!c_pattern) {
84  delete [] pattern_;
85  pattern_ = nil;
86  }
87 }
88 
89 Regexp::Regexp (const char* pat, int length) {
90  pattern_ = new char[length+1];
91  strncpy(pattern_, pat, length);
92  pattern_[length] = '\0';
94  if (!c_pattern) {
95  delete [] pattern_;
96  pattern_ = nil;
97  }
98 }
99 
101  if (pattern_) {
102  delete [] pattern_;
103  }
104  if (c_pattern) {
105  delete [] c_pattern;
106  }
107 }
108 
109 const char* Regexp::pattern() const { return pattern_; }
110 
111 int Regexp::Search (const char* text, int length, int index, int range) {
112  bool forwardSearch;
113  bool frontAnchored;
114  bool endAnchored;
115  char* searchStart;
116  char* searchLimit;
117  char* endOfLine = nil;
118  char* lastMatch = nil;
119  char csave;
120 
121  /*
122  * A small sanity check. Otherwise length is unused in this function.
123  * This is really what the logic embedded in the old version of this
124  * routine enforced.
125  */
126  if (index + range > length) {
127  range = length - index;
128  if (range < 0)
129  return -1;
130  }
131 
132  if (c_pattern == nil) {
133  return -1;
134  }
135 
136  c_pattern->startp[0] = nil;
137 
138  if (range < 0) {
139  forwardSearch = false;
140  searchLimit = (char *) text + index;
141  searchStart = (char *) searchLimit + range; /* range is negative */
142  } else {
143  forwardSearch = true;
144  searchStart = (char *) text + index;
145  searchLimit = (char *) searchStart + range;
146  }
147 
148  /* Mark end of text string so search will stop */
149  char save = *searchLimit;
150  *searchLimit = '\0';
151 
152  frontAnchored = pattern_[0] == '^';
153  endAnchored = pattern_[strlen(pattern_)-1] == '$';
154  if (frontAnchored && (searchStart != text || searchStart[-1] == '\n')) {
155  searchStart = NextLine(searchStart);
156  }
157 
158  while (searchStart && searchStart < searchLimit) {
159  int result;
160 
161  if (endAnchored && (endOfLine = FindNewline(searchStart)) != nil) {
162  csave = *endOfLine;
163  *endOfLine = '\0';
164  }
165 
166  result = regexec(c_pattern, searchStart);
167 
168  if (endOfLine)
169  *endOfLine = csave;
170 
171  if (result) {
172  /* Found a match */
173  if (forwardSearch)
174  break; /* Done */
175  else {
176  lastMatch = c_pattern->startp[0];
177  searchStart = c_pattern->endp[0];
178  if (frontAnchored)
179  searchStart = NextLine(searchStart);
180  continue;
181  }
182  }
183  /* Did not find a match */
184  if (frontAnchored || endAnchored)
185  searchStart = NextLine(searchStart);
186  else
187  break;
188  }
189 
190  if (!forwardSearch && lastMatch) {
191  if (endAnchored && (endOfLine = FindNewline(lastMatch)) != nil) {
192  csave = *endOfLine;
193  *endOfLine = '\0';
194  }
195  (void) regexec(c_pattern, lastMatch); /* Refill startp and endp */
196  if (endOfLine)
197  *endOfLine = csave;
198  }
199 
200  *searchLimit = save;
201  c_pattern->textStart = (char *) text;
202 
203  return c_pattern->startp[0] - c_pattern->textStart;
204 }
205 
206 int Regexp::Match (const char* text, int length, int index) {
207 
208  if (c_pattern == nil)
209  return -1;
210 
211  c_pattern->startp[0] = nil;
212 
213  char save = *(text+length);
214  *(char*)(text+length) = '\0';
215 
216  c_pattern->textStart = (char *) text;
217  (void) regexec(c_pattern, (char *) text + index);
218 
219  *(char*)(text+length) = save;
220 
221  if (c_pattern->startp[0] != nil)
222  return c_pattern->endp[0] - c_pattern->startp[0];
223  else
224  return -1;
225 }
226 
227 int Regexp::BeginningOfMatch (int subexp) {
228  if (subexp < 0 || subexp > NSUBEXP ||
229  c_pattern == nil || c_pattern->startp[0] == nil)
230  return -1;
231  return c_pattern->startp[subexp] - c_pattern->textStart;
232 }
233 
234 int Regexp::EndOfMatch (int subexp) {
235  if (subexp < 0 || subexp > NSUBEXP ||
236  c_pattern == nil || c_pattern->startp[0] == nil)
237  return -1;
238  return c_pattern->endp[subexp] - c_pattern->textStart;
239 }
240 
241 /*
242  * regcomp and regexec
243  *
244  * Copyright (c) 1986 by University of Toronto.
245  * Written by Henry Spencer. Not derived from licensed software.
246  *
247  * Permission is granted to anyone to use this software for any
248  * purpose on any computer system, and to redistribute it freely,
249  * subject to the following restrictions:
250  *
251  * 1. The author is not responsible for the consequences of use of
252  * this software, no matter how awful, even if they arise
253  * from defects in it.
254  *
255  * 2. The origin of this software must not be misrepresented, either
256  * by explicit claim or by omission.
257  *
258  * 3. Altered versions must be plainly marked as such, and must not
259  * be misrepresented as being the original software.
260  *
261  * Beware that some of this code is subtly aware of the way operator
262  * precedence is structured in regular expressions. Serious changes in
263  * regular-expression syntax might require a total rethink.
264  */
265 
266 /*
267  * The "internal use only" fields in regexp.h are present to pass info from
268  * compile to execute that permits the execute phase to run lots faster on
269  * simple cases. They are:
270  *
271  * regstart char that must begin a match; '\0' if none obvious
272  * reganch is the match anchored (at beginning-of-line only)?
273  * regmust string (pointer into program) that match must include, or nil
274  * regmlen length of regmust string
275  *
276  * Regstart and reganch permit very fast decisions on suitable starting points
277  * for a match, cutting down the work a lot. Regmust permits fast rejection
278  * of lines that cannot possibly match. The regmust tests are costly enough
279  * that regcomp() supplies a regmust only if the r.e. contains something
280  * potentially expensive (at present, the only such thing detected is * or +
281  * at the start of the r.e., which can involve a lot of backup). Regmlen is
282  * supplied because the test in regexec() needs it and regcomp() is computing
283  * it anyway.
284  */
285 
286 /*
287  * Structure for regexp "program". This is essentially a linear encoding
288  * of a nondeterministic finite-state machine (aka syntax charts or
289  * "railroad normal form" in parsing technology). Each node is an opcode
290  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
291  * all nodes except BRANCH implement concatenation; a "next" pointer with
292  * a BRANCH on both ends of it is connecting two alternatives. (Here we
293  * have one of the subtle syntax dependencies: an individual BRANCH (as
294  * opposed to a collection of them) is never concatenated with anything
295  * because of operator precedence.) The operand of some types of node is
296  * a literal string; for others, it is a node leading into a sub-FSM. In
297  * particular, the operand of a BRANCH node is the first node of the branch.
298  * (NB this is *not* a tree structure: the tail of the branch connects
299  * to the thing following the set of BRANCHes.) The opcodes are:
300  */
301 
302 /* definition number opnd? meaning */
303 #define END 0 /* no End of program. */
304 #define BOL 1 /* no Match "" at beginning of line. */
305 #define EOL 2 /* no Match "" at end of line. */
306 #define ANY 3 /* no Match any one character. */
307 #define ANYOF 4 /* str Match any character in this string. */
308 #define ANYBUT 5 /* str Match any character not in this string. */
309 #define BRANCH 6 /* node Match this alternative, or the next... */
310 #define BACK 7 /* no Match "", "next" ptr points backward. */
311 #define EXACTLY 8 /* str Match this string. */
312 #define NOTHING 9 /* no Match empty string. */
313 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
314 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
315 #define OPEN 20 /* no Mark this point in input as start of #n. */
316  /* OPEN+1 is number 1, etc. */
317 #define CLOSE 30 /* no Analogous to OPEN. */
318 
319 /*
320  * Opcode notes:
321  *
322  * BRANCH The set of branches constituting a single choice are hooked
323  * together with their "next" pointers, since precedence prevents
324  * anything being concatenated to any individual branch. The
325  * "next" pointer of the last BRANCH in a choice points to the
326  * thing following the whole choice. This is also where the
327  * final "next" pointer of each individual branch points; each
328  * branch starts with the operand node of a BRANCH node.
329  *
330  * BACK Normal "next" pointers all implicitly point forward; BACK
331  * exists to make loop structures possible.
332  *
333  * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
334  * BRANCH structures using BACK. Simple cases (one character
335  * per match) are implemented with STAR and PLUS for speed
336  * and to minimize recursive plunges.
337  *
338  * OPEN,CLOSE ...are numbered at compile time.
339  */
340 
341 /*
342  * A node is one char of opcode followed by two chars of "next" pointer.
343  * "Next" pointers are stored as two 8-bit pieces, high order first. The
344  * value is a positive offset from the opcode of the node containing it.
345  * An operand, if any, simply follows the node. (Note that much of the
346  * code generation knows about this implicit relationship.)
347  *
348  * Using two bytes for the "next" pointer is vast overkill for most things,
349  * but allows patterns to get big without disasters.
350  */
351 #define OP(p) (*(p))
352 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
353 #define OPERAND(p) ((p) + 3)
354 
355 /*
356  * Utility definitions.
357  */
358 #ifndef RE_CHARBITS
359 #define RE_CHARBITS 0xff
360 #endif
361 
362 #define UCHARAT(p) ((int)*(p)&RE_CHARBITS)
363 
364 #define FAIL(m) { regerror(m); return(nil); }
365 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
366 #define META "^$.[()|?+*\\"
367 
368 /*
369  * Flags to be passed up and down.
370  */
371 #define HASWIDTH 01 /* Known never to match null string. */
372 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
373 #define SPSTART 04 /* Starts with * or +. */
374 #define WORST 0 /* Worst case. */
375 
376 /*
377  * Global work variables for regcomp().
378  */
379 static const char *regparse; /* Input-scan pointer. */
380 static int regnpar; /* () count. */
381 static char regdummy;
382 static char *regcode; /* Code-emit pointer; &regdummy = don't. */
383 static long regsize; /* Code size. */
384 
385 /*
386  - regcomp - compile a regular expression into internal code
387  *
388  * We can't allocate space until we know how big the compiled form will be,
389  * but we can't compile it (and thus know how big it is) until we've got a
390  * place to put the code. So we cheat: we compile it twice, once with code
391  * generation turned off and size counting turned on, and once "for real".
392  * This also means that we don't allocate space until we are sure that the
393  * thing really will compile successfully, and we never have to move the
394  * code and thus invalidate pointers into it. (Note that it has to be in
395  * one piece because free() must be able to free it all.)
396  *
397  * Beware that the optimization-preparation code in here knows about some
398  * of the structure of the compiled regexp.
399  */
400 static regexp *
401 regcomp(const char* exp) {
402  regexp *r;
403  char *scan;
404  char *longest;
405  int len;
406  int flags;
407 
408  if (exp == nil)
409  FAIL("nil argument");
410 
411  /* First pass: determine size, legality. */
412  regparse = exp;
413  regnpar = 1;
414  regsize = 0L;
415  regcode = &regdummy;
417  if (reg(0, &flags) == nil)
418  return(nil);
419 
420  /* Small enough for pointer-storage convention? */
421  if (regsize >= 32767L) /* Probably could be 65535L. */
422  FAIL("regexp too big");
423 
424  /* Allocate space. */
425  r = (regexp *) new char[sizeof(regexp) + (unsigned)regsize];
426 
427  /* Second pass: emit code. */
428  regparse = exp;
429  regnpar = 1;
430  regcode = r->program;
432  if (reg(0, &flags) == nil) {
433  delete [] r;
434  return(nil);
435  }
436 
437  /* Dig out information for optimizations. */
438  r->regstart = '\0'; /* Worst-case defaults. */
439  r->reganch = 0;
440  r->regmust = nil;
441  r->regmlen = 0;
442  scan = r->program+1; /* First BRANCH. */
443  if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
444  scan = OPERAND(scan);
445 
446  /* Starting-point info. */
447  if (OP(scan) == EXACTLY)
448  r->regstart = *OPERAND(scan);
449  else if (OP(scan) == BOL)
450  r->reganch++;
451 
452  /*
453  * If there's something expensive in the r.e., find the
454  * longest literal string that must appear and make it the
455  * regmust. Resolve ties in favor of later strings, since
456  * the regstart check works with the beginning of the r.e.
457  * and avoiding duplication strengthens checking. Not a
458  * strong reason, but sufficient in the absence of others.
459  */
460  if (flags&SPSTART) {
461  longest = nil;
462  len = 0;
463  for (; scan != nil; scan = regnext(scan))
464  if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
465  longest = OPERAND(scan);
466  len = strlen(OPERAND(scan));
467  }
468  r->regmust = longest;
469  r->regmlen = len;
470  }
471  }
472 
473  return(r);
474 }
475 
476 /*
477  - reg - regular expression, i.e. main body or parenthesized thing
478  *
479  * Caller must absorb opening parenthesis.
480  *
481  * Combining parenthesis handling with the base level of regular expression
482  * is a trifle forced, but the need to tie the tails of the branches to what
483  * follows makes it hard to avoid.
484  */
485 static char *
486 reg(int paren, int* flagp) {
487  char *ret;
488  char *br;
489  char *ender;
490  int parno;
491  int flags;
492 
493  *flagp = HASWIDTH; /* Tentatively. */
494 
495  /* Make an OPEN node, if parenthesized. */
496  if (paren) {
497  if (regnpar >= NSUBEXP)
498  FAIL("too many ()");
499  parno = regnpar;
500  regnpar++;
501  ret = regnode(OPEN+parno);
502  } else
503  ret = nil;
504 
505  /* Pick up the branches, linking them together. */
506  br = regbranch(&flags);
507  if (br == nil)
508  return(nil);
509  if (ret != nil)
510  regtail(ret, br); /* OPEN -> first. */
511  else
512  ret = br;
513  if (!(flags&HASWIDTH))
514  *flagp &= ~HASWIDTH;
515  *flagp |= flags&SPSTART;
516  while (*regparse == '|') {
517  regparse++;
518  br = regbranch(&flags);
519  if (br == nil)
520  return(nil);
521  regtail(ret, br); /* BRANCH -> BRANCH. */
522  if (!(flags&HASWIDTH))
523  *flagp &= ~HASWIDTH;
524  *flagp |= flags&SPSTART;
525  }
526 
527  /* Make a closing node, and hook it on the end. */
528  ender = regnode((paren) ? CLOSE+parno : END);
529  regtail(ret, ender);
530 
531  /* Hook the tails of the branches to the closing node. */
532  for (br = ret; br != nil; br = regnext(br))
533  regoptail(br, ender);
534 
535  /* Check for proper termination. */
536  if (paren && *regparse++ != ')') {
537  FAIL("unmatched ()");
538  } else if (!paren && *regparse != '\0') {
539  if (*regparse == ')') {
540  FAIL("unmatched ()");
541  } else
542  FAIL("junk on end"); /* "Can't happen". */
543  /* NOTREACHED */
544  }
545 
546  return(ret);
547 }
548 
549 /*
550  - regbranch - one alternative of an | operator
551  *
552  * Implements the concatenation operator.
553  */
554 static char *
555 regbranch(int* flagp) {
556  char *ret;
557  char *chain;
558  char *latest;
559  int flags;
560 
561  *flagp = WORST; /* Tentatively. */
562 
563  ret = regnode(BRANCH);
564  chain = nil;
565  while (*regparse != '\0' && *regparse != '|') {
566  if (*regparse == '\\' && regparse[1] == ')') {
567  regparse++;
568  break;
569  }
570  latest = regpiece(&flags);
571  if (latest == nil)
572  return(nil);
573  *flagp |= flags&HASWIDTH;
574  if (chain == nil) /* First piece. */
575  *flagp |= flags&SPSTART;
576  else
577  regtail(chain, latest);
578  chain = latest;
579  }
580  if (chain == nil) /* Loop ran zero times. */
581  (void) regnode(NOTHING);
582 
583  return(ret);
584 }
585 
586 /*
587  - regpiece - something followed by possible [*+?]
588  *
589  * Note that the branching code sequences used for ? and the general cases
590  * of * and + are somewhat optimized: they use the same NOTHING node as
591  * both the endmarker for their branch list and the body of the last branch.
592  * It might seem that this node could be dispensed with entirely, but the
593  * endmarker role is not redundant.
594  */
595 static char *
596 regpiece(int* flagp) {
597  char *ret;
598  char op;
599  char *next;
600  int flags;
601 
602  ret = regatom(&flags);
603  if (ret == nil)
604  return(nil);
605 
606  op = *regparse;
607  if (!ISMULT(op)) {
608  *flagp = flags;
609  return(ret);
610  }
611 
612  if (!(flags&HASWIDTH) && op != '?')
613  FAIL("*+ operand could be empty");
614  *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
615 
616  if (op == '*' && (flags&SIMPLE))
617  reginsert(STAR, ret);
618  else if (op == '*') {
619  /* Emit x* as (x&|), where & means "self". */
620  reginsert(BRANCH, ret); /* Either x */
621  regoptail(ret, regnode(BACK)); /* and loop */
622  regoptail(ret, ret); /* back */
623  regtail(ret, regnode(BRANCH)); /* or */
624  regtail(ret, regnode(NOTHING)); /* null. */
625  } else if (op == '+' && (flags&SIMPLE))
626  reginsert(PLUS, ret);
627  else if (op == '+') {
628  /* Emit x+ as x(&|), where & means "self". */
629  next = regnode(BRANCH); /* Either */
630  regtail(ret, next);
631  regtail(regnode(BACK), ret); /* loop back */
632  regtail(next, regnode(BRANCH)); /* or */
633  regtail(ret, regnode(NOTHING)); /* null. */
634  } else if (op == '?') {
635  /* Emit x? as (x|) */
636  reginsert(BRANCH, ret); /* Either x */
637  regtail(ret, regnode(BRANCH)); /* or */
638  next = regnode(NOTHING); /* null. */
639  regtail(ret, next);
640  regoptail(ret, next);
641  }
642  regparse++;
643  if (ISMULT(*regparse))
644  FAIL("nested *?+");
645 
646  return(ret);
647 }
648 
649 /*
650  - regatom - the lowest level
651  *
652  * Optimization: gobbles an entire sequence of ordinary characters so that
653  * it can turn them into a single node, which is smaller to store and
654  * faster to run. Backslashed characters are exceptions, each becoming a
655  * separate node; the code is simpler that way and it's not worth fixing.
656  */
657 static char *
658 regatom(int* flagp) {
659  char *ret;
660  int flags;
661 
662  *flagp = WORST; /* Tentatively. */
663 
664  switch (*regparse++) {
665  case '^':
666  ret = regnode(BOL);
667  break;
668  case '$':
669  ret = regnode(EOL);
670  break;
671  case '.':
672  ret = regnode(ANY);
673  *flagp |= HASWIDTH|SIMPLE;
674  break;
675  case '[': {
676  int classbeg;
677  int classend;
678 
679  if (*regparse == '^') { /* Complement of range. */
680  ret = regnode(ANYBUT);
681  regparse++;
682  } else
683  ret = regnode(ANYOF);
684  if (*regparse == ']' || *regparse == '-')
685  regc(*regparse++);
686  while (*regparse != '\0' && *regparse != ']') {
687  if (*regparse == '-') {
688  regparse++;
689  if (*regparse == ']' || *regparse == '\0')
690  regc('-');
691  else {
692  classbeg = UCHARAT(regparse-2)+1;
693  classend = UCHARAT(regparse);
694  if (classbeg > classend+1)
695  FAIL("invalid [] range");
696  for (; classbeg <= classend; classbeg++)
697  regc(classbeg);
698  regparse++;
699  }
700  } else
701  regc(*regparse++);
702  }
703  regc('\0');
704  if (*regparse != ']')
705  FAIL("unmatched []");
706  regparse++;
707  *flagp |= HASWIDTH|SIMPLE;
708  }
709  break;
710  case '\0':
711  case '|':
712  FAIL("internal urp"); /* Supposed to be caught earlier. */
713  break;
714  case '?':
715  case '+':
716  case '*':
717  FAIL("?+* follows nothing");
718  break;
719  case '\\':
720  if (*regparse == '\0')
721  FAIL("trailing \\");
722  if (*regparse == '(') {
723  regparse++;
724  ret = reg(1, &flags);
725  if (ret == nil)
726  return(nil);
727  *flagp |= flags&(HASWIDTH|SPSTART);
728  } else {
729  ret = regnode(EXACTLY);
730  regc(*regparse++);
731  regc('\0');
732  *flagp |= HASWIDTH|SIMPLE;
733  }
734  break;
735  default: {
736  int len;
737  char ender;
738 
739  regparse--;
740  len = strcspn(regparse, META);
741  if (len <= 0)
742  FAIL("internal disaster");
743  ender = *(regparse+len);
744  if (len > 1 && ISMULT(ender))
745  len--; /* Back off clear of ?+* operand. */
746  *flagp |= HASWIDTH;
747  if (len == 1)
748  *flagp |= SIMPLE;
749  ret = regnode(EXACTLY);
750  while (len > 0) {
751  regc(*regparse++);
752  len--;
753  }
754  regc('\0');
755  }
756  break;
757  }
758 
759  return(ret);
760 }
761 
762 /*
763  - regnode - emit a node
764  */
765 static char * /* Location. */
766 regnode(char op) {
767  char *ret;
768  char *ptr;
769 
770  ret = regcode;
771  if (ret == &regdummy) {
772  regsize += 3;
773  return(ret);
774  }
775 
776  ptr = ret;
777  *ptr++ = op;
778  *ptr++ = '\0'; /* Null "next" pointer. */
779  *ptr++ = '\0';
780  regcode = ptr;
781 
782  return(ret);
783 }
784 
785 /*
786  - regc - emit (if appropriate) a byte of code
787  */
788 static void
789 regc(char b) {
790  if (regcode != &regdummy)
791  *regcode++ = b;
792  else
793  regsize++;
794 }
795 
796 /*
797  - reginsert - insert an operator in front of already-emitted operand
798  *
799  * Means relocating the operand.
800  */
801 static void
802 reginsert(char op, char* opnd) {
803  char *src;
804  char *dst;
805  char *place;
806 
807  if (regcode == &regdummy) {
808  regsize += 3;
809  return;
810  }
811 
812  src = regcode;
813  regcode += 3;
814  dst = regcode;
815  while (src > opnd)
816  *--dst = *--src;
817 
818  place = opnd; /* Op node, where operand used to be. */
819  *place++ = op;
820  *place++ = '\0';
821  *place++ = '\0';
822 }
823 
824 /*
825  - regtail - set the next-pointer at the end of a node chain
826  */
827 static void
828 regtail(char* p, char* val) {
829  char *scan;
830  char *temp;
831  int offset;
832 
833  if (p == &regdummy)
834  return;
835 
836  /* Find last node. */
837  scan = p;
838  for (;;) {
839  temp = regnext(scan);
840  if (temp == nil)
841  break;
842  scan = temp;
843  }
844 
845  if (OP(scan) == BACK)
846  offset = scan - val;
847  else
848  offset = val - scan;
849  *(scan+1) = (offset>>8)&0377;
850  *(scan+2) = offset&0377;
851 }
852 
853 /*
854  - regoptail - regtail on operand of first argument; nop if operandless
855  */
856 static void
857 regoptail(char* p, char* val) {
858  /* "Operandless" and "op != BRANCH" are synonymous in practice. */
859  if (p == nil || p == &regdummy || OP(p) != BRANCH)
860  return;
861  regtail(OPERAND(p), val);
862 }
863 
864 /*
865  * regexec and friends
866  */
867 
868 /*
869  * Global work variables for regexec().
870  */
871 static char *reginput; /* String-input pointer. */
872 static char *regbol; /* Beginning of input, for ^ check. */
873 static char **regstartp; /* Pointer to startp array. */
874 static char **regendp; /* Ditto for endp. */
875 
876 /*
877  - regexec - match a regexp against a string
878  */
879 static int
880 regexec(regexp* prog, char* string) {
881  char *s;
882 
883  /* Be paranoid... */
884  if (prog == nil || string == nil) {
885  regerror("nil parameter");
886  return(0);
887  }
888 
889  /* Check validity of program. */
890  if (UCHARAT(prog->program) != REGEXP_MAGIC) {
891  regerror("corrupted program");
892  return(0);
893  }
894 
895  /* If there is a "must appear" string, look for it. */
896  if (prog->regmust != nil) {
897  s = string;
898  while ((s = strchr(s, prog->regmust[0])) != nil) {
899  if (strncmp(s, prog->regmust, prog->regmlen) == 0)
900  break; /* Found it. */
901  s++;
902  }
903  if (s == nil) /* Not present. */
904  return(0);
905  }
906 
907  /* Mark beginning of line for ^ . */
908  regbol = string;
909 
910  /* Simplest case: anchored match need be tried only once. */
911  if (prog->reganch)
912  return(regtry(prog, string));
913 
914  /* Messy cases: unanchored match. */
915  s = string;
916  if (prog->regstart != '\0')
917  /* We know what char it must start with. */
918  while ((s = strchr(s, prog->regstart)) != nil) {
919  if (regtry(prog, s))
920  return(1);
921  s++;
922  }
923  else
924  /* We don't -- general case. */
925  do {
926  if (regtry(prog, s))
927  return(1);
928  } while (*s++ != '\0');
929 
930  /* Failure. */
931  return(0);
932 }
933 
934 /*
935  - regtry - try match at specific point
936  */
937 static int /* 0 failure, 1 success */
938 regtry(regexp* prog, char* string) {
939  int i;
940  char **sp;
941  char **ep;
942 
943  reginput = string;
944  regstartp = prog->startp;
945  regendp = prog->endp;
946 
947  sp = prog->startp;
948  ep = prog->endp;
949  for (i = NSUBEXP; i > 0; i--) {
950  *sp++ = nil;
951  *ep++ = nil;
952  }
953  if (regmatch(prog->program + 1)) {
954  prog->startp[0] = string;
955  prog->endp[0] = reginput;
956  return(1);
957  } else
958  return(0);
959 }
960 
961 /*
962  - regmatch - main matching routine
963  *
964  * Conceptually the strategy is simple: check to see whether the current
965  * node matches, call self recursively to see whether the rest matches,
966  * and then act accordingly. In practice we make some effort to avoid
967  * recursion, in particular by going through "ordinary" nodes (that don't
968  * need to know whether the rest of the match failed) by a loop instead of
969  * by recursion.
970  */
971 static int /* 0 failure, 1 success */
972 regmatch(char* prog) {
973  char *scan; /* Current node. */
974  char *next; /* Next node. */
975 
976  scan = prog;
977  while (scan != nil) {
978  next = regnext(scan);
979 
980  switch (OP(scan)) {
981  case BOL:
982  if (reginput != regbol)
983  return(0);
984  break;
985  case EOL:
986  if (*reginput != '\0')
987  return(0);
988  break;
989  case ANY:
990  if (*reginput == '\0')
991  return(0);
992  reginput++;
993  break;
994  case EXACTLY: {
995  int len;
996  char *opnd;
997 
998  opnd = OPERAND(scan);
999  /* Inline the first character, for speed. */
1000  if (*opnd != *reginput)
1001  return(0);
1002  len = strlen(opnd);
1003  if (len > 1 && strncmp(opnd, reginput, len) != 0)
1004  return(0);
1005  reginput += len;
1006  }
1007  break;
1008  case ANYOF:
1009  if (*reginput == '\0')
1010  return(0);
1011  if (strchr(OPERAND(scan), *reginput) == nil)
1012  return(0);
1013  reginput++;
1014  break;
1015  case ANYBUT:
1016  if (*reginput == '\0')
1017  return(0);
1018  if (strchr(OPERAND(scan), *reginput) != nil)
1019  return(0);
1020  reginput++;
1021  break;
1022  case NOTHING:
1023  break;
1024  case BACK:
1025  break;
1026  case OPEN+1:
1027  case OPEN+2:
1028  case OPEN+3:
1029  case OPEN+4:
1030  case OPEN+5:
1031  case OPEN+6:
1032  case OPEN+7:
1033  case OPEN+8:
1034  case OPEN+9: {
1035  int no;
1036  char *save;
1037 
1038  no = OP(scan) - OPEN;
1039  save = reginput;
1040 
1041  if (regmatch(next)) {
1042  /*
1043  * Don't set startp if some later
1044  * invocation of the same parentheses
1045  * already has.
1046  */
1047  if (regstartp[no] == nil)
1048  regstartp[no] = save;
1049  return(1);
1050  } else
1051  return(0);
1052  }
1053  break;
1054  case CLOSE+1:
1055  case CLOSE+2:
1056  case CLOSE+3:
1057  case CLOSE+4:
1058  case CLOSE+5:
1059  case CLOSE+6:
1060  case CLOSE+7:
1061  case CLOSE+8:
1062  case CLOSE+9: {
1063  int no;
1064  char *save;
1065 
1066  no = OP(scan) - CLOSE;
1067  save = reginput;
1068 
1069  if (regmatch(next)) {
1070  /*
1071  * Don't set endp if some later
1072  * invocation of the same parentheses
1073  * already has.
1074  */
1075  if (regendp[no] == nil)
1076  regendp[no] = save;
1077  return(1);
1078  } else
1079  return(0);
1080  }
1081  break;
1082  case BRANCH: {
1083  char *save;
1084 
1085  if (OP(next) != BRANCH) /* No choice. */
1086  next = OPERAND(scan); /* Avoid recursion. */
1087  else {
1088  do {
1089  save = reginput;
1090  if (regmatch(OPERAND(scan)))
1091  return(1);
1092  reginput = save;
1093  scan = regnext(scan);
1094  } while (scan != nil && OP(scan) == BRANCH);
1095  return(0);
1096  /* NOTREACHED */
1097  }
1098  }
1099  break;
1100  case STAR:
1101  case PLUS: {
1102  char nextch;
1103  int no;
1104  char *save;
1105  int min;
1106 
1107  /*
1108  * Lookahead to avoid useless match attempts
1109  * when we know what character comes next.
1110  */
1111  nextch = '\0';
1112  if (OP(next) == EXACTLY)
1113  nextch = *OPERAND(next);
1114  min = (OP(scan) == STAR) ? 0 : 1;
1115  save = reginput;
1116  no = regrepeat(OPERAND(scan));
1117  while (no >= min) {
1118  /* If it could work, try it. */
1119  if (nextch == '\0' || *reginput == nextch)
1120  if (regmatch(next))
1121  return(1);
1122  /* Couldn't or didn't -- back up. */
1123  no--;
1124  reginput = save + no;
1125  }
1126  return(0);
1127  }
1128  break;
1129  case END:
1130  return(1); /* Success! */
1131  default:
1132  regerror("memory corruption");
1133  return(0);
1134  }
1135 
1136  scan = next;
1137  }
1138 
1139  /*
1140  * We get here only if there's trouble -- normally "case END" is
1141  * the terminating point.
1142  */
1143  regerror("corrupted pointers");
1144  return(0);
1145 }
1146 
1147 /*
1148  - regrepeat - repeatedly match something simple, report how many
1149  */
1150 static int
1151 regrepeat(char* p) {
1152  int count = 0;
1153  char *scan;
1154  char *opnd;
1155 
1156  scan = reginput;
1157  opnd = OPERAND(p);
1158  switch (OP(p)) {
1159  case ANY:
1160  count = strlen(scan);
1161  scan += count;
1162  break;
1163  case EXACTLY:
1164  while (*opnd == *scan) {
1165  count++;
1166  scan++;
1167  }
1168  break;
1169  case ANYOF:
1170  while (*scan != '\0' && strchr(opnd, *scan) != nil) {
1171  count++;
1172  scan++;
1173  }
1174  break;
1175  case ANYBUT:
1176  while (*scan != '\0' && strchr(opnd, *scan) == nil) {
1177  count++;
1178  scan++;
1179  }
1180  break;
1181  default: /* Oh dear. Called inappropriately. */
1182  regerror("internal foulup");
1183  count = 0; /* Best compromise. */
1184  break;
1185  }
1186  reginput = scan;
1187 
1188  return(count);
1189 }
1190 
1191 /*
1192  - regnext - dig the "next" pointer out of a node
1193  */
1194 static char *
1195 regnext(char* p) {
1196  int offset;
1197 
1198  if (p == &regdummy)
1199  return(nil);
1200 
1201  offset = NEXT(p);
1202  if (offset == 0)
1203  return(nil);
1204 
1205  if (OP(p) == BACK)
1206  return(p-offset);
1207  else
1208  return(p+offset);
1209 }
1210 
1211 static void
1212 regerror(const char* s) {
1213  std::cerr << "regexp: " << s << "\n";
1214 }
1215 
#define CLOSE
Definition: regexp.cpp:317
static char * regcode
Definition: regexp.cpp:382
char * regmust
Definition: regexp.h:50
static char regdummy
Definition: regexp.cpp:381
#define BACK
Definition: regexp.cpp:310
Regexp(const char *)
Definition: regexp.cpp:77
#define BOL
Definition: regexp.cpp:304
static char * regnode(char op)
Definition: regexp.cpp:766
#define text
Definition: plot.cpp:81
static int regrepeat(char *p)
Definition: regexp.cpp:1151
static regexp * regcomp(const char *exp)
Definition: regexp.cpp:401
#define min(a, b)
Definition: matrix.h:157
#define NSUBEXP
Definition: regexp.h:43
void
#define ANYBUT
Definition: regexp.cpp:308
size_t p
char program[1]
Definition: regexp.h:52
#define EOL
Definition: regexp.cpp:305
static void reginsert(char op, char *opnd)
Definition: regexp.cpp:802
#define STAR
Definition: regexp.cpp:313
Item * next(Item *item)
Definition: list.cpp:95
char * pattern_
Definition: regexp.h:73
#define OPERAND(p)
Definition: regexp.cpp:353
static char ** regstartp
Definition: regexp.cpp:873
#define OPEN
Definition: regexp.cpp:315
#define NOTHING
Definition: regexp.cpp:312
static char * reginput
Definition: regexp.cpp:871
#define OP(p)
Definition: regexp.cpp:351
static void regtail(char *p, char *val)
Definition: regexp.cpp:828
~Regexp()
Definition: regexp.cpp:100
char * NextLine(char *s)
Definition: regexp.cpp:69
char reganch
Definition: regexp.h:49
char * startp[NSUBEXP]
Definition: regexp.h:45
Inst * prog
Definition: code.cpp:136
static void regerror(const char *s)
Definition: regexp.cpp:1212
char * endp[NSUBEXP]
Definition: regexp.h:46
#define SPSTART
Definition: regexp.cpp:373
_CONST char * s
Definition: system.cpp:74
#define PLUS
Definition: regexp.cpp:314
int val
Definition: dll.cpp:167
regexp * c_pattern
Definition: regexp.h:74
#define ret
Definition: redef.h:123
static long regsize
Definition: regexp.cpp:383
static int regexec(regexp *prog, char *string)
Definition: regexp.cpp:880
static int regtry(regexp *prog, char *string)
Definition: regexp.cpp:938
static void regoptail(char *p, char *val)
Definition: regexp.cpp:857
static int regmatch(char *prog)
Definition: regexp.cpp:972
static char * reg(int paren, int *flagp)
Definition: regexp.cpp:486
static char * regnext(char *p)
Definition: regexp.cpp:1195
static char * regbranch(int *flagp)
Definition: regexp.cpp:555
exp
Definition: extdef.h:3
static char ** regendp
Definition: regexp.cpp:874
#define WORST
Definition: regexp.cpp:374
#define nil
Definition: enter-scope.h:36
#define UCHARAT(p)
Definition: regexp.cpp:362
int BeginningOfMatch(int subexp=0)
Definition: regexp.cpp:227
static char * regbol
Definition: regexp.cpp:872
#define END
Definition: regexp.cpp:303
#define ISMULT(c)
Definition: regexp.cpp:365
char * FindNewline(char *s)
Definition: regexp.cpp:64
#define REGEXP_MAGIC
Definition: regexp.h:59
#define FAIL(m)
Definition: regexp.cpp:364
char regstart
Definition: regexp.h:48
#define HASWIDTH
Definition: regexp.cpp:371
#define ANY
Definition: regexp.cpp:306
static double save(void *v)
Definition: ocbox.cpp:340
const char * pattern() const
Definition: regexp.cpp:109
#define BRANCH
Definition: regexp.cpp:309
#define i
Definition: md1redef.h:12
static const char * regparse
Definition: regexp.cpp:379
#define META
Definition: regexp.cpp:366
#define SIMPLE
Definition: regexp.cpp:372
#define ANYOF
Definition: regexp.cpp:307
static void regc(char b)
Definition: regexp.cpp:789
static char * regpiece(int *flagp)
Definition: regexp.cpp:596
static struct @32 flags[]
int Match(const char *text, int length, int index)
Definition: regexp.cpp:206
#define NEXT(p)
Definition: regexp.cpp:352
int Search(const char *text, int length, int index, int range)
Definition: regexp.cpp:111
int EndOfMatch(int subexp=0)
Definition: regexp.cpp:234
static int regnpar
Definition: regexp.cpp:380
int regmlen
Definition: regexp.h:51
Definition: regexp.h:44
#define EXACTLY
Definition: regexp.cpp:311
short index
Definition: cabvars.h:11
static char * regatom(int *flagp)
Definition: regexp.cpp:658
char * textStart
Definition: regexp.h:47