NEURON
regexp.cpp
Go to the documentation of this file.
1 #ifdef HAVE_CONFIG_H
2 #include <../../nrnconf.h>
3 #endif
4 /*
5  * Copyright (c) 1987, 1988, 1989, 1990, 1991 Stanford University
6  * Copyright (c) 1991 Silicon Graphics, Inc.
7  *
8  * Permission to use, copy, modify, distribute, and sell this software and
9  * its documentation for any purpose is hereby granted without fee, provided
10  * that (i) the above copyright notices and this permission notice appear in
11  * all copies of the software and related documentation, and (ii) the names of
12  * Stanford and Silicon Graphics may not be used in any advertising or
13  * publicity relating to the software without the specific, prior written
14  * permission of Stanford and Silicon Graphics.
15  *
16  * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
17  * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
18  * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * IN NO EVENT SHALL STANFORD OR SILICON GRAPHICS BE LIABLE FOR
21  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
22  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
23  * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
24  * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
25  * OF THIS SOFTWARE.
26  */
27 
28 /*
29  * Regexp - regular expression searching
30  */
31 
32 #include <InterViews/regexp.h>
33 #include <ivstream.h>
34 #include <string.h>
35 
36 /*
37  * This version is based on the Henry Spencers public domain reimplementation
38  * of the regular expression matching subroutines. They are included as
39  * static subroutines after the externally accessible routines.
40  */
41 
42 /*
43  * Forward declarations for regcomp()'s friends.
44  */
45 static regexp* regcomp(const char* exp);
46 static char* reg(int paren, int* flagp);
47 static char* regbranch(int* flagp);
48 static char* regpiece(int* flagp);
49 static char* regatom(int* flagp);
50 static char* regnode(char op);
51 static char* regnext(char* p);
52 static void regc(char b);
53 static void reginsert(char op, char* opnd);
54 static void regtail(char* p, char* val);
55 static void regoptail(char* p, char* val);
56 static void regerror(const char* s);
57 static int regexec(regexp* prog, char* string);
58 static int regtry(regexp* prog, char* string);
59 static int regmatch(char* prog);
60 static int regrepeat(char* p);
61 
62 
63 inline char *
64 FindNewline(char* s) {
65  return strchr(s, '\n');
66 }
67 
68 inline char *
69 NextLine(char* s) {
70  char* newstart;
71 
72  if ((newstart = FindNewline(s)) != nil)
73  newstart++;
74  return newstart;
75 }
76 
77 Regexp::Regexp (const char* pat) {
78  int length = strlen(pat);
79  pattern_ = new char[length+1];
80  strncpy(pattern_, pat, length);
81  pattern_[length] = '\0';
83  if (!c_pattern) {
84  delete [] pattern_;
85  pattern_ = nil;
86  }
87 }
88 
89 Regexp::Regexp (const char* pat, int length) {
90  pattern_ = new char[length+1];
91  strncpy(pattern_, pat, length);
92  pattern_[length] = '\0';
94  if (!c_pattern) {
95  delete [] pattern_;
96  pattern_ = nil;
97  }
98 }
99 
101  if (pattern_) {
102  delete [] pattern_;
103  }
104  if (c_pattern) {
105  delete [] c_pattern;
106  }
107 }
108 
109 const char* Regexp::pattern() const { return pattern_; }
110 
111 int Regexp::Search (const char* text, int length, int index, int range) {
112  bool forwardSearch;
113  bool frontAnchored;
114  bool endAnchored;
115  char* searchStart;
116  char* searchLimit;
117  char* endOfLine = nil;
118  char* lastMatch = nil;
119  char csave;
120 
121  /*
122  * A small sanity check. Otherwise length is unused in this function.
123  * This is really what the logic embedded in the old version of this
124  * routine enforced.
125  */
126  if (index + range > length) {
127  range = length - index;
128  if (range < 0)
129  return -1;
130  }
131 
132  if (c_pattern == nil) {
133  return -1;
134  }
135 
136  c_pattern->startp[0] = nil;
137 
138  if (range < 0) {
139  forwardSearch = false;
140  searchLimit = (char *) text + index;
141  searchStart = (char *) searchLimit + range; /* range is negative */
142  } else {
143  forwardSearch = true;
144  searchStart = (char *) text + index;
145  searchLimit = (char *) searchStart + range;
146  }
147 
148  /* Mark end of text string so search will stop */
149  char save = *searchLimit;
150  *searchLimit = '\0';
151 
152  frontAnchored = pattern_[0] == '^';
153  endAnchored = pattern_[strlen(pattern_)-1] == '$';
154  if (frontAnchored && (searchStart != text || searchStart[-1] == '\n')) {
155  searchStart = NextLine(searchStart);
156  }
157 
158  while (searchStart && searchStart < searchLimit) {
159  int result;
160 
161  if (endAnchored && (endOfLine = FindNewline(searchStart)) != nil) {
162  csave = *endOfLine;
163  *endOfLine = '\0';
164  }
165 
166  result = regexec(c_pattern, searchStart);
167 
168  if (endOfLine)
169  *endOfLine = csave;
170 
171  if (result) {
172  /* Found a match */
173  if (forwardSearch)
174  break; /* Done */
175  else {
176  lastMatch = c_pattern->startp[0];
177  searchStart = c_pattern->endp[0];
178  if (frontAnchored)
179  searchStart = NextLine(searchStart);
180  continue;
181  }
182  }
183  /* Did not find a match */
184  if (frontAnchored || endAnchored)
185  searchStart = NextLine(searchStart);
186  else
187  break;
188  }
189 
190  if (!forwardSearch && lastMatch) {
191  if (endAnchored && (endOfLine = FindNewline(lastMatch)) != nil) {
192  csave = *endOfLine;
193  *endOfLine = '\0';
194  }
195  (void) regexec(c_pattern, lastMatch); /* Refill startp and endp */
196  if (endOfLine)
197  *endOfLine = csave;
198  }
199 
200  *searchLimit = save;
201  c_pattern->textStart = (char *) text;
202 
203  return c_pattern->startp[0] - c_pattern->textStart;
204 }
205 
206 int Regexp::Match (const char* text, int length, int index) {
207 
208  if (c_pattern == nil)
209  return -1;
210 
211  c_pattern->startp[0] = nil;
212 
213  char save = *(text+length);
214  *(char*)(text+length) = '\0';
215 
216  c_pattern->textStart = (char *) text;
217  (void) regexec(c_pattern, (char *) text + index);
218 
219  *(char*)(text+length) = save;
220 
221  if (c_pattern->startp[0] != nil)
222  return c_pattern->endp[0] - c_pattern->startp[0];
223  else
224  return -1;
225 }
226 
227 int Regexp::BeginningOfMatch (int subexp) {
228  if (subexp < 0 || subexp > NSUBEXP ||
229  c_pattern == nil || c_pattern->startp[0] == nil)
230  return -1;
231  return c_pattern->startp[subexp] - c_pattern->textStart;
232 }
233 
234 int Regexp::EndOfMatch (int subexp) {
235  if (subexp < 0 || subexp > NSUBEXP ||
236  c_pattern == nil || c_pattern->startp[0] == nil)
237  return -1;
238  return c_pattern->endp[subexp] - c_pattern->textStart;
239 }
240 
241 /*
242  * regcomp and regexec
243  *
244  * Copyright (c) 1986 by University of Toronto.
245  * Written by Henry Spencer. Not derived from licensed software.
246  *
247  * Permission is granted to anyone to use this software for any
248  * purpose on any computer system, and to redistribute it freely,
249  * subject to the following restrictions:
250  *
251  * 1. The author is not responsible for the consequences of use of
252  * this software, no matter how awful, even if they arise
253  * from defects in it.
254  *
255  * 2. The origin of this software must not be misrepresented, either
256  * by explicit claim or by omission.
257  *
258  * 3. Altered versions must be plainly marked as such, and must not
259  * be misrepresented as being the original software.
260  *
261  * Beware that some of this code is subtly aware of the way operator
262  * precedence is structured in regular expressions. Serious changes in
263  * regular-expression syntax might require a total rethink.
264  */
265 
266 /*
267  * The "internal use only" fields in regexp.h are present to pass info from
268  * compile to execute that permits the execute phase to run lots faster on
269  * simple cases. They are:
270  *
271  * regstart char that must begin a match; '\0' if none obvious
272  * reganch is the match anchored (at beginning-of-line only)?
273  * regmust string (pointer into program) that match must include, or nil
274  * regmlen length of regmust string
275  *
276  * Regstart and reganch permit very fast decisions on suitable starting points
277  * for a match, cutting down the work a lot. Regmust permits fast rejection
278  * of lines that cannot possibly match. The regmust tests are costly enough
279  * that regcomp() supplies a regmust only if the r.e. contains something
280  * potentially expensive (at present, the only such thing detected is * or +
281  * at the start of the r.e., which can involve a lot of backup). Regmlen is
282  * supplied because the test in regexec() needs it and regcomp() is computing
283  * it anyway.
284  */
285 
286 /*
287  * Structure for regexp "program". This is essentially a linear encoding
288  * of a nondeterministic finite-state machine (aka syntax charts or
289  * "railroad normal form" in parsing technology). Each node is an opcode
290  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
291  * all nodes except BRANCH implement concatenation; a "next" pointer with
292  * a BRANCH on both ends of it is connecting two alternatives. (Here we
293  * have one of the subtle syntax dependencies: an individual BRANCH (as
294  * opposed to a collection of them) is never concatenated with anything
295  * because of operator precedence.) The operand of some types of node is
296  * a literal string; for others, it is a node leading into a sub-FSM. In
297  * particular, the operand of a BRANCH node is the first node of the branch.
298  * (NB this is *not* a tree structure: the tail of the branch connects
299  * to the thing following the set of BRANCHes.) The opcodes are:
300  */
301 
302 /* definition number opnd? meaning */
303 #define END 0 /* no End of program. */
304 #define BOL 1 /* no Match "" at beginning of line. */
305 #define EOL 2 /* no Match "" at end of line. */
306 #define ANY 3 /* no Match any one character. */
307 #define ANYOF 4 /* str Match any character in this string. */
308 #define ANYBUT 5 /* str Match any character not in this string. */
309 #define BRANCH 6 /* node Match this alternative, or the next... */
310 #define BACK 7 /* no Match "", "next" ptr points backward. */
311 #define EXACTLY 8 /* str Match this string. */
312 #define NOTHING 9 /* no Match empty string. */
313 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
314 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
315 #define OPEN 20 /* no Mark this point in input as start of #n. */
316  /* OPEN+1 is number 1, etc. */
317 #define CLOSE 30 /* no Analogous to OPEN. */
318 
319 /*
320  * Opcode notes:
321  *
322  * BRANCH The set of branches constituting a single choice are hooked
323  * together with their "next" pointers, since precedence prevents
324  * anything being concatenated to any individual branch. The
325  * "next" pointer of the last BRANCH in a choice points to the
326  * thing following the whole choice. This is also where the
327  * final "next" pointer of each individual branch points; each
328  * branch starts with the operand node of a BRANCH node.
329  *
330  * BACK Normal "next" pointers all implicitly point forward; BACK
331  * exists to make loop structures possible.
332  *
333  * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
334  * BRANCH structures using BACK. Simple cases (one character
335  * per match) are implemented with STAR and PLUS for speed
336  * and to minimize recursive plunges.
337  *
338  * OPEN,CLOSE ...are numbered at compile time.
339  */
340 
341 /*
342  * A node is one char of opcode followed by two chars of "next" pointer.
343  * "Next" pointers are stored as two 8-bit pieces, high order first. The
344  * value is a positive offset from the opcode of the node containing it.
345  * An operand, if any, simply follows the node. (Note that much of the
346  * code generation knows about this implicit relationship.)
347  *
348  * Using two bytes for the "next" pointer is vast overkill for most things,
349  * but allows patterns to get big without disasters.
350  */
351 #define OP(p) (*(p))
352 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
353 #define OPERAND(p) ((p) + 3)
354 
355 /*
356  * Utility definitions.
357  */
358 
359 /**
360  * This replaces a macro of the same name with some bit manipulation magic in
361  * it. The does not seem well-suited now, but it's not clear that it was before
362  * either.
363  */
364 inline int UCHARAT(const char* p) {
365  return *p;
366 }
367 
368 #define FAIL(m) { regerror(m); return(nil); }
369 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
370 #define META "^$.[()|?+*\\"
371 
372 /*
373  * Flags to be passed up and down.
374  */
375 #define HASWIDTH 01 /* Known never to match null string. */
376 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
377 #define SPSTART 04 /* Starts with * or +. */
378 #define WORST 0 /* Worst case. */
379 
380 /*
381  * Global work variables for regcomp().
382  */
383 static const char *regparse; /* Input-scan pointer. */
384 static int regnpar; /* () count. */
385 static char regdummy;
386 static char *regcode; /* Code-emit pointer; &regdummy = don't. */
387 static long regsize; /* Code size. */
388 
389 /*
390  - regcomp - compile a regular expression into internal code
391  *
392  * We can't allocate space until we know how big the compiled form will be,
393  * but we can't compile it (and thus know how big it is) until we've got a
394  * place to put the code. So we cheat: we compile it twice, once with code
395  * generation turned off and size counting turned on, and once "for real".
396  * This also means that we don't allocate space until we are sure that the
397  * thing really will compile successfully, and we never have to move the
398  * code and thus invalidate pointers into it. (Note that it has to be in
399  * one piece because free() must be able to free it all.)
400  *
401  * Beware that the optimization-preparation code in here knows about some
402  * of the structure of the compiled regexp.
403  */
404 static regexp *
405 regcomp(const char* exp) {
406  regexp *r;
407  char *scan;
408  char *longest;
409  int len;
410  int flags;
411 
412  if (exp == nil)
413  FAIL("nil argument");
414 
415  /* First pass: determine size, legality. */
416  regparse = exp;
417  regnpar = 1;
418  regsize = 0L;
419  regcode = &regdummy;
421  if (reg(0, &flags) == nil)
422  return(nil);
423 
424  /* Small enough for pointer-storage convention? */
425  if (regsize >= 32767L) /* Probably could be 65535L. */
426  FAIL("regexp too big");
427 
428  /* Allocate space. */
429  r = (regexp *) new char[sizeof(regexp) + (unsigned)regsize];
430 
431  /* Second pass: emit code. */
432  regparse = exp;
433  regnpar = 1;
434  regcode = r->program;
436  if (reg(0, &flags) == nil) {
437  delete [] r;
438  return(nil);
439  }
440 
441  /* Dig out information for optimizations. */
442  r->regstart = '\0'; /* Worst-case defaults. */
443  r->reganch = 0;
444  r->regmust = nil;
445  r->regmlen = 0;
446  scan = r->program+1; /* First BRANCH. */
447  if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
448  scan = OPERAND(scan);
449 
450  /* Starting-point info. */
451  if (OP(scan) == EXACTLY)
452  r->regstart = *OPERAND(scan);
453  else if (OP(scan) == BOL)
454  r->reganch++;
455 
456  /*
457  * If there's something expensive in the r.e., find the
458  * longest literal string that must appear and make it the
459  * regmust. Resolve ties in favor of later strings, since
460  * the regstart check works with the beginning of the r.e.
461  * and avoiding duplication strengthens checking. Not a
462  * strong reason, but sufficient in the absence of others.
463  */
464  if (flags&SPSTART) {
465  longest = nil;
466  len = 0;
467  for (; scan != nil; scan = regnext(scan))
468  if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
469  longest = OPERAND(scan);
470  len = strlen(OPERAND(scan));
471  }
472  r->regmust = longest;
473  r->regmlen = len;
474  }
475  }
476 
477  return(r);
478 }
479 
480 /*
481  - reg - regular expression, i.e. main body or parenthesized thing
482  *
483  * Caller must absorb opening parenthesis.
484  *
485  * Combining parenthesis handling with the base level of regular expression
486  * is a trifle forced, but the need to tie the tails of the branches to what
487  * follows makes it hard to avoid.
488  */
489 static char *
490 reg(int paren, int* flagp) {
491  char *ret;
492  char *br;
493  char *ender;
494  int parno;
495  int flags;
496 
497  *flagp = HASWIDTH; /* Tentatively. */
498 
499  /* Make an OPEN node, if parenthesized. */
500  if (paren) {
501  if (regnpar >= NSUBEXP)
502  FAIL("too many ()");
503  parno = regnpar;
504  regnpar++;
505  ret = regnode(OPEN+parno);
506  } else
507  ret = nil;
508 
509  /* Pick up the branches, linking them together. */
510  br = regbranch(&flags);
511  if (br == nil)
512  return(nil);
513  if (ret != nil)
514  regtail(ret, br); /* OPEN -> first. */
515  else
516  ret = br;
517  if (!(flags&HASWIDTH))
518  *flagp &= ~HASWIDTH;
519  *flagp |= flags&SPSTART;
520  while (*regparse == '|') {
521  regparse++;
522  br = regbranch(&flags);
523  if (br == nil)
524  return(nil);
525  regtail(ret, br); /* BRANCH -> BRANCH. */
526  if (!(flags&HASWIDTH))
527  *flagp &= ~HASWIDTH;
528  *flagp |= flags&SPSTART;
529  }
530 
531  /* Make a closing node, and hook it on the end. */
532  ender = regnode((paren) ? CLOSE+parno : END);
533  regtail(ret, ender);
534 
535  /* Hook the tails of the branches to the closing node. */
536  for (br = ret; br != nil; br = regnext(br))
537  regoptail(br, ender);
538 
539  /* Check for proper termination. */
540  if (paren && *regparse++ != ')') {
541  FAIL("unmatched ()");
542  } else if (!paren && *regparse != '\0') {
543  if (*regparse == ')') {
544  FAIL("unmatched ()");
545  } else
546  FAIL("junk on end"); /* "Can't happen". */
547  /* NOTREACHED */
548  }
549 
550  return(ret);
551 }
552 
553 /*
554  - regbranch - one alternative of an | operator
555  *
556  * Implements the concatenation operator.
557  */
558 static char *
559 regbranch(int* flagp) {
560  char *ret;
561  char *chain;
562  char *latest;
563  int flags;
564 
565  *flagp = WORST; /* Tentatively. */
566 
567  ret = regnode(BRANCH);
568  chain = nil;
569  while (*regparse != '\0' && *regparse != '|') {
570  if (*regparse == '\\' && regparse[1] == ')') {
571  regparse++;
572  break;
573  }
574  latest = regpiece(&flags);
575  if (latest == nil)
576  return(nil);
577  *flagp |= flags&HASWIDTH;
578  if (chain == nil) /* First piece. */
579  *flagp |= flags&SPSTART;
580  else
581  regtail(chain, latest);
582  chain = latest;
583  }
584  if (chain == nil) /* Loop ran zero times. */
585  (void) regnode(NOTHING);
586 
587  return(ret);
588 }
589 
590 /*
591  - regpiece - something followed by possible [*+?]
592  *
593  * Note that the branching code sequences used for ? and the general cases
594  * of * and + are somewhat optimized: they use the same NOTHING node as
595  * both the endmarker for their branch list and the body of the last branch.
596  * It might seem that this node could be dispensed with entirely, but the
597  * endmarker role is not redundant.
598  */
599 static char *
600 regpiece(int* flagp) {
601  char *ret;
602  char op;
603  char *next;
604  int flags;
605 
606  ret = regatom(&flags);
607  if (ret == nil)
608  return(nil);
609 
610  op = *regparse;
611  if (!ISMULT(op)) {
612  *flagp = flags;
613  return(ret);
614  }
615 
616  if (!(flags&HASWIDTH) && op != '?')
617  FAIL("*+ operand could be empty");
618  *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
619 
620  if (op == '*' && (flags&SIMPLE))
621  reginsert(STAR, ret);
622  else if (op == '*') {
623  /* Emit x* as (x&|), where & means "self". */
624  reginsert(BRANCH, ret); /* Either x */
625  regoptail(ret, regnode(BACK)); /* and loop */
626  regoptail(ret, ret); /* back */
627  regtail(ret, regnode(BRANCH)); /* or */
628  regtail(ret, regnode(NOTHING)); /* null. */
629  } else if (op == '+' && (flags&SIMPLE))
630  reginsert(PLUS, ret);
631  else if (op == '+') {
632  /* Emit x+ as x(&|), where & means "self". */
633  next = regnode(BRANCH); /* Either */
634  regtail(ret, next);
635  regtail(regnode(BACK), ret); /* loop back */
636  regtail(next, regnode(BRANCH)); /* or */
637  regtail(ret, regnode(NOTHING)); /* null. */
638  } else if (op == '?') {
639  /* Emit x? as (x|) */
640  reginsert(BRANCH, ret); /* Either x */
641  regtail(ret, regnode(BRANCH)); /* or */
642  next = regnode(NOTHING); /* null. */
643  regtail(ret, next);
644  regoptail(ret, next);
645  }
646  regparse++;
647  if (ISMULT(*regparse))
648  FAIL("nested *?+");
649 
650  return(ret);
651 }
652 
653 /*
654  - regatom - the lowest level
655  *
656  * Optimization: gobbles an entire sequence of ordinary characters so that
657  * it can turn them into a single node, which is smaller to store and
658  * faster to run. Backslashed characters are exceptions, each becoming a
659  * separate node; the code is simpler that way and it's not worth fixing.
660  */
661 static char *
662 regatom(int* flagp) {
663  char *ret;
664  int flags;
665 
666  *flagp = WORST; /* Tentatively. */
667 
668  switch (*regparse++) {
669  case '^':
670  ret = regnode(BOL);
671  break;
672  case '$':
673  ret = regnode(EOL);
674  break;
675  case '.':
676  ret = regnode(ANY);
677  *flagp |= HASWIDTH|SIMPLE;
678  break;
679  case '[': {
680  int classbeg;
681  int classend;
682 
683  if (*regparse == '^') { /* Complement of range. */
684  ret = regnode(ANYBUT);
685  regparse++;
686  } else
687  ret = regnode(ANYOF);
688  if (*regparse == ']' || *regparse == '-')
689  regc(*regparse++);
690  while (*regparse != '\0' && *regparse != ']') {
691  if (*regparse == '-') {
692  regparse++;
693  if (*regparse == ']' || *regparse == '\0')
694  regc('-');
695  else {
696  classbeg = UCHARAT(regparse-2)+1;
697  classend = UCHARAT(regparse);
698  if (classbeg > classend+1)
699  FAIL("invalid [] range");
700  for (; classbeg <= classend; classbeg++)
701  regc(classbeg);
702  regparse++;
703  }
704  } else
705  regc(*regparse++);
706  }
707  regc('\0');
708  if (*regparse != ']')
709  FAIL("unmatched []");
710  regparse++;
711  *flagp |= HASWIDTH|SIMPLE;
712  }
713  break;
714  case '\0':
715  case '|':
716  FAIL("internal urp"); /* Supposed to be caught earlier. */
717  break;
718  case '?':
719  case '+':
720  case '*':
721  FAIL("?+* follows nothing");
722  break;
723  case '\\':
724  if (*regparse == '\0')
725  FAIL("trailing \\");
726  if (*regparse == '(') {
727  regparse++;
728  ret = reg(1, &flags);
729  if (ret == nil)
730  return(nil);
731  *flagp |= flags&(HASWIDTH|SPSTART);
732  } else {
733  ret = regnode(EXACTLY);
734  regc(*regparse++);
735  regc('\0');
736  *flagp |= HASWIDTH|SIMPLE;
737  }
738  break;
739  default: {
740  int len;
741  char ender;
742 
743  regparse--;
744  len = strcspn(regparse, META);
745  if (len <= 0)
746  FAIL("internal disaster");
747  ender = *(regparse+len);
748  if (len > 1 && ISMULT(ender))
749  len--; /* Back off clear of ?+* operand. */
750  *flagp |= HASWIDTH;
751  if (len == 1)
752  *flagp |= SIMPLE;
753  ret = regnode(EXACTLY);
754  while (len > 0) {
755  regc(*regparse++);
756  len--;
757  }
758  regc('\0');
759  }
760  break;
761  }
762 
763  return(ret);
764 }
765 
766 /*
767  - regnode - emit a node
768  */
769 static char * /* Location. */
770 regnode(char op) {
771  char *ret;
772  char *ptr;
773 
774  ret = regcode;
775  if (ret == &regdummy) {
776  regsize += 3;
777  return(ret);
778  }
779 
780  ptr = ret;
781  *ptr++ = op;
782  *ptr++ = '\0'; /* Null "next" pointer. */
783  *ptr++ = '\0';
784  regcode = ptr;
785 
786  return(ret);
787 }
788 
789 /*
790  - regc - emit (if appropriate) a byte of code
791  */
792 static void
793 regc(char b) {
794  if (regcode != &regdummy)
795  *regcode++ = b;
796  else
797  regsize++;
798 }
799 
800 /*
801  - reginsert - insert an operator in front of already-emitted operand
802  *
803  * Means relocating the operand.
804  */
805 static void
806 reginsert(char op, char* opnd) {
807  char *src;
808  char *dst;
809  char *place;
810 
811  if (regcode == &regdummy) {
812  regsize += 3;
813  return;
814  }
815 
816  src = regcode;
817  regcode += 3;
818  dst = regcode;
819  while (src > opnd)
820  *--dst = *--src;
821 
822  place = opnd; /* Op node, where operand used to be. */
823  *place++ = op;
824  *place++ = '\0';
825  *place++ = '\0';
826 }
827 
828 /*
829  - regtail - set the next-pointer at the end of a node chain
830  */
831 static void
832 regtail(char* p, char* val) {
833  char *scan;
834  char *temp;
835  int offset;
836 
837  if (p == &regdummy)
838  return;
839 
840  /* Find last node. */
841  scan = p;
842  for (;;) {
843  temp = regnext(scan);
844  if (temp == nil)
845  break;
846  scan = temp;
847  }
848 
849  if (OP(scan) == BACK)
850  offset = scan - val;
851  else
852  offset = val - scan;
853  *(scan+1) = (offset>>8)&0377;
854  *(scan+2) = offset&0377;
855 }
856 
857 /*
858  - regoptail - regtail on operand of first argument; nop if operandless
859  */
860 static void
861 regoptail(char* p, char* val) {
862  /* "Operandless" and "op != BRANCH" are synonymous in practice. */
863  if (p == nil || p == &regdummy || OP(p) != BRANCH)
864  return;
865  regtail(OPERAND(p), val);
866 }
867 
868 /*
869  * regexec and friends
870  */
871 
872 /*
873  * Global work variables for regexec().
874  */
875 static char *reginput; /* String-input pointer. */
876 static char *regbol; /* Beginning of input, for ^ check. */
877 static char **regstartp; /* Pointer to startp array. */
878 static char **regendp; /* Ditto for endp. */
879 
880 /*
881  - regexec - match a regexp against a string
882  */
883 static int
884 regexec(regexp* prog, char* string) {
885  char *s;
886 
887  /* Be paranoid... */
888  if (prog == nil || string == nil) {
889  regerror("nil parameter");
890  return(0);
891  }
892 
893  /* Check validity of program. */
894  if (UCHARAT(prog->program) != REGEXP_MAGIC) {
895  regerror("corrupted program");
896  return(0);
897  }
898 
899  /* If there is a "must appear" string, look for it. */
900  if (prog->regmust != nil) {
901  s = string;
902  while ((s = strchr(s, prog->regmust[0])) != nil) {
903  if (strncmp(s, prog->regmust, prog->regmlen) == 0)
904  break; /* Found it. */
905  s++;
906  }
907  if (s == nil) /* Not present. */
908  return(0);
909  }
910 
911  /* Mark beginning of line for ^ . */
912  regbol = string;
913 
914  /* Simplest case: anchored match need be tried only once. */
915  if (prog->reganch)
916  return(regtry(prog, string));
917 
918  /* Messy cases: unanchored match. */
919  s = string;
920  if (prog->regstart != '\0')
921  /* We know what char it must start with. */
922  while ((s = strchr(s, prog->regstart)) != nil) {
923  if (regtry(prog, s))
924  return(1);
925  s++;
926  }
927  else
928  /* We don't -- general case. */
929  do {
930  if (regtry(prog, s))
931  return(1);
932  } while (*s++ != '\0');
933 
934  /* Failure. */
935  return(0);
936 }
937 
938 /*
939  - regtry - try match at specific point
940  */
941 static int /* 0 failure, 1 success */
942 regtry(regexp* prog, char* string) {
943  int i;
944  char **sp;
945  char **ep;
946 
947  reginput = string;
948  regstartp = prog->startp;
949  regendp = prog->endp;
950 
951  sp = prog->startp;
952  ep = prog->endp;
953  for (i = NSUBEXP; i > 0; i--) {
954  *sp++ = nil;
955  *ep++ = nil;
956  }
957  if (regmatch(prog->program + 1)) {
958  prog->startp[0] = string;
959  prog->endp[0] = reginput;
960  return(1);
961  } else
962  return(0);
963 }
964 
965 /*
966  - regmatch - main matching routine
967  *
968  * Conceptually the strategy is simple: check to see whether the current
969  * node matches, call self recursively to see whether the rest matches,
970  * and then act accordingly. In practice we make some effort to avoid
971  * recursion, in particular by going through "ordinary" nodes (that don't
972  * need to know whether the rest of the match failed) by a loop instead of
973  * by recursion.
974  */
975 static int /* 0 failure, 1 success */
976 regmatch(char* prog) {
977  char *scan; /* Current node. */
978  char *next; /* Next node. */
979 
980  scan = prog;
981  while (scan != nil) {
982  next = regnext(scan);
983 
984  switch (OP(scan)) {
985  case BOL:
986  if (reginput != regbol)
987  return(0);
988  break;
989  case EOL:
990  if (*reginput != '\0')
991  return(0);
992  break;
993  case ANY:
994  if (*reginput == '\0')
995  return(0);
996  reginput++;
997  break;
998  case EXACTLY: {
999  int len;
1000  char *opnd;
1001 
1002  opnd = OPERAND(scan);
1003  /* Inline the first character, for speed. */
1004  if (*opnd != *reginput)
1005  return(0);
1006  len = strlen(opnd);
1007  if (len > 1 && strncmp(opnd, reginput, len) != 0)
1008  return(0);
1009  reginput += len;
1010  }
1011  break;
1012  case ANYOF:
1013  if (*reginput == '\0')
1014  return(0);
1015  if (strchr(OPERAND(scan), *reginput) == nil)
1016  return(0);
1017  reginput++;
1018  break;
1019  case ANYBUT:
1020  if (*reginput == '\0')
1021  return(0);
1022  if (strchr(OPERAND(scan), *reginput) != nil)
1023  return(0);
1024  reginput++;
1025  break;
1026  case NOTHING:
1027  break;
1028  case BACK:
1029  break;
1030  case OPEN+1:
1031  case OPEN+2:
1032  case OPEN+3:
1033  case OPEN+4:
1034  case OPEN+5:
1035  case OPEN+6:
1036  case OPEN+7:
1037  case OPEN+8:
1038  case OPEN+9: {
1039  int no;
1040  char *save;
1041 
1042  no = OP(scan) - OPEN;
1043  save = reginput;
1044 
1045  if (regmatch(next)) {
1046  /*
1047  * Don't set startp if some later
1048  * invocation of the same parentheses
1049  * already has.
1050  */
1051  if (regstartp[no] == nil)
1052  regstartp[no] = save;
1053  return(1);
1054  } else
1055  return(0);
1056  }
1057  break;
1058  case CLOSE+1:
1059  case CLOSE+2:
1060  case CLOSE+3:
1061  case CLOSE+4:
1062  case CLOSE+5:
1063  case CLOSE+6:
1064  case CLOSE+7:
1065  case CLOSE+8:
1066  case CLOSE+9: {
1067  int no;
1068  char *save;
1069 
1070  no = OP(scan) - CLOSE;
1071  save = reginput;
1072 
1073  if (regmatch(next)) {
1074  /*
1075  * Don't set endp if some later
1076  * invocation of the same parentheses
1077  * already has.
1078  */
1079  if (regendp[no] == nil)
1080  regendp[no] = save;
1081  return(1);
1082  } else
1083  return(0);
1084  }
1085  break;
1086  case BRANCH: {
1087  char *save;
1088 
1089  if (OP(next) != BRANCH) /* No choice. */
1090  next = OPERAND(scan); /* Avoid recursion. */
1091  else {
1092  do {
1093  save = reginput;
1094  if (regmatch(OPERAND(scan)))
1095  return(1);
1096  reginput = save;
1097  scan = regnext(scan);
1098  } while (scan != nil && OP(scan) == BRANCH);
1099  return(0);
1100  /* NOTREACHED */
1101  }
1102  }
1103  break;
1104  case STAR:
1105  case PLUS: {
1106  char nextch;
1107  int no;
1108  char *save;
1109  int min;
1110 
1111  /*
1112  * Lookahead to avoid useless match attempts
1113  * when we know what character comes next.
1114  */
1115  nextch = '\0';
1116  if (OP(next) == EXACTLY)
1117  nextch = *OPERAND(next);
1118  min = (OP(scan) == STAR) ? 0 : 1;
1119  save = reginput;
1120  no = regrepeat(OPERAND(scan));
1121  while (no >= min) {
1122  /* If it could work, try it. */
1123  if (nextch == '\0' || *reginput == nextch)
1124  if (regmatch(next))
1125  return(1);
1126  /* Couldn't or didn't -- back up. */
1127  no--;
1128  reginput = save + no;
1129  }
1130  return(0);
1131  }
1132  break;
1133  case END:
1134  return(1); /* Success! */
1135  default:
1136  regerror("memory corruption");
1137  return(0);
1138  }
1139 
1140  scan = next;
1141  }
1142 
1143  /*
1144  * We get here only if there's trouble -- normally "case END" is
1145  * the terminating point.
1146  */
1147  regerror("corrupted pointers");
1148  return(0);
1149 }
1150 
1151 /*
1152  - regrepeat - repeatedly match something simple, report how many
1153  */
1154 static int
1155 regrepeat(char* p) {
1156  int count = 0;
1157  char *scan;
1158  char *opnd;
1159 
1160  scan = reginput;
1161  opnd = OPERAND(p);
1162  switch (OP(p)) {
1163  case ANY:
1164  count = strlen(scan);
1165  scan += count;
1166  break;
1167  case EXACTLY:
1168  while (*opnd == *scan) {
1169  count++;
1170  scan++;
1171  }
1172  break;
1173  case ANYOF:
1174  while (*scan != '\0' && strchr(opnd, *scan) != nil) {
1175  count++;
1176  scan++;
1177  }
1178  break;
1179  case ANYBUT:
1180  while (*scan != '\0' && strchr(opnd, *scan) == nil) {
1181  count++;
1182  scan++;
1183  }
1184  break;
1185  default: /* Oh dear. Called inappropriately. */
1186  regerror("internal foulup");
1187  count = 0; /* Best compromise. */
1188  break;
1189  }
1190  reginput = scan;
1191 
1192  return(count);
1193 }
1194 
1195 /*
1196  - regnext - dig the "next" pointer out of a node
1197  */
1198 static char *
1199 regnext(char* p) {
1200  int offset;
1201 
1202  if (p == &regdummy)
1203  return(nil);
1204 
1205  offset = NEXT(p);
1206  if (offset == 0)
1207  return(nil);
1208 
1209  if (OP(p) == BACK)
1210  return(p-offset);
1211  else
1212  return(p+offset);
1213 }
1214 
1215 static void
1216 regerror(const char* s) {
1217  std::cerr << "regexp: " << s << "\n";
1218 }
1219 
#define nil
Definition: enter-scope.h:36
short index
Definition: cabvars.h:10
regexp * c_pattern
Definition: regexp.h:77
int Match(const char *text, int length, int index)
Definition: regexp.cpp:206
int EndOfMatch(int subexp=0)
Definition: regexp.cpp:234
int BeginningOfMatch(int subexp=0)
Definition: regexp.cpp:227
char * pattern_
Definition: regexp.h:76
int Search(const char *text, int length, int index, int range)
Definition: regexp.cpp:111
~Regexp()
Definition: regexp.cpp:100
Regexp(const char *)
Definition: regexp.cpp:77
const char * pattern() const
Definition: regexp.cpp:109
Inst * prog
Definition: code.cpp:143
void
static char ** regendp
Definition: regexp.cpp:878
static char * regnode(char op)
Definition: regexp.cpp:770
#define CLOSE
Definition: regexp.cpp:317
#define PLUS
Definition: regexp.cpp:314
static char * regpiece(int *flagp)
Definition: regexp.cpp:600
#define OPEN
Definition: regexp.cpp:315
#define OP(p)
Definition: regexp.cpp:351
static regexp * regcomp(const char *exp)
Definition: regexp.cpp:405
static int regtry(regexp *prog, char *string)
Definition: regexp.cpp:942
#define END
Definition: regexp.cpp:303
static void regtail(char *p, char *val)
Definition: regexp.cpp:832
static char * regnext(char *p)
Definition: regexp.cpp:1199
#define BOL
Definition: regexp.cpp:304
static char * regcode
Definition: regexp.cpp:386
#define EOL
Definition: regexp.cpp:305
#define META
Definition: regexp.cpp:370
#define WORST
Definition: regexp.cpp:378
#define BRANCH
Definition: regexp.cpp:309
static long regsize
Definition: regexp.cpp:387
char * FindNewline(char *s)
Definition: regexp.cpp:64
static char ** regstartp
Definition: regexp.cpp:877
static void reginsert(char op, char *opnd)
Definition: regexp.cpp:806
#define ANY
Definition: regexp.cpp:306
int UCHARAT(const char *p)
This replaces a macro of the same name with some bit manipulation magic in it.
Definition: regexp.cpp:364
static void regc(char b)
Definition: regexp.cpp:793
static int regnpar
Definition: regexp.cpp:384
static char * regbranch(int *flagp)
Definition: regexp.cpp:559
#define EXACTLY
Definition: regexp.cpp:311
static void regerror(const char *s)
Definition: regexp.cpp:1216
#define STAR
Definition: regexp.cpp:313
static int regrepeat(char *p)
Definition: regexp.cpp:1155
#define FAIL(m)
Definition: regexp.cpp:368
#define SPSTART
Definition: regexp.cpp:377
#define NOTHING
Definition: regexp.cpp:312
#define OPERAND(p)
Definition: regexp.cpp:353
static char * reg(int paren, int *flagp)
Definition: regexp.cpp:490
static char * reginput
Definition: regexp.cpp:875
#define BACK
Definition: regexp.cpp:310
static char regdummy
Definition: regexp.cpp:385
static char * regbol
Definition: regexp.cpp:876
static const char * regparse
Definition: regexp.cpp:383
static char * regatom(int *flagp)
Definition: regexp.cpp:662
#define ISMULT(c)
Definition: regexp.cpp:369
static int regexec(regexp *prog, char *string)
Definition: regexp.cpp:884
#define ANYOF
Definition: regexp.cpp:307
#define NEXT(p)
Definition: regexp.cpp:352
#define ANYBUT
Definition: regexp.cpp:308
char * NextLine(char *s)
Definition: regexp.cpp:69
#define HASWIDTH
Definition: regexp.cpp:375
#define SIMPLE
Definition: regexp.cpp:376
static void regoptail(char *p, char *val)
Definition: regexp.cpp:861
static int regmatch(char *prog)
Definition: regexp.cpp:976
#define min(a, b)
Definition: matrix.h:157
#define i
Definition: md1redef.h:12
exp
Definition: extdef.h:5
Item * next(Item *item)
Definition: list.cpp:88
size_t p
static double save(void *v)
Definition: ocbox.cpp:346
#define text
Definition: plot.cpp:85
#define ret
Definition: redef.h:123
#define NSUBEXP
Definition: regexp.h:43
#define REGEXP_MAGIC
Definition: regexp.h:62
Definition: regexp.h:44
char * startp[NSUBEXP]
Definition: regexp.h:45
char * regmust
Definition: regexp.h:50
char reganch
Definition: regexp.h:49
char program[1]
Definition: regexp.h:52
int regmlen
Definition: regexp.h:51
char * textStart
Definition: regexp.h:47
char * endp[NSUBEXP]
Definition: regexp.h:46
char regstart
Definition: regexp.h:48