1 : /*
2 : * The matching engine and friends. This file is #included by regexec.c
3 : * after suitable #defines of a variety of macros used herein, so that
4 : * different state representations can be used without duplicating masses
5 : * of code.
6 : */
7 :
8 : #ifdef SNAMES
9 : #define matcher smatcher
10 : #define fast sfast
11 : #define slow sslow
12 : #define dissect sdissect
13 : #define backref sbackref
14 : #define step sstep
15 : #define print sprint
16 : #define at sat
17 : #define match smat
18 : #endif
19 : #ifdef LNAMES
20 : #define matcher lmatcher
21 : #define fast lfast
22 : #define slow lslow
23 : #define dissect ldissect
24 : #define backref lbackref
25 : #define step lstep
26 : #define print lprint
27 : #define at lat
28 : #define match lmat
29 : #endif
30 :
31 : /* another structure passed up and down to avoid zillions of parameters */
32 : struct match {
33 : struct re_guts *g;
34 : int eflags;
35 : regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
36 : unsigned char *offp; /* offsets work from here */
37 : unsigned char *beginp; /* start of string -- virtual NUL precedes */
38 : unsigned char *endp; /* end of string -- virtual NUL here */
39 : unsigned char *coldp; /* can be no match starting before here */
40 : unsigned char **lastpos; /* [nplus+1] */
41 : STATEVARS;
42 : states st; /* current states */
43 : states fresh; /* states for a fresh start */
44 : states tmp; /* temporary */
45 : states empty; /* empty set of states */
46 : };
47 :
48 : #include "engine.ih"
49 :
50 : #ifdef REDEBUG
51 : #define SP(t, s, c) print(m, t, s, c, stdout)
52 : #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
53 : #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
54 : #else
55 : #define SP(t, s, c) /* nothing */
56 : #define AT(t, p1, p2, s1, s2) /* nothing */
57 : #define NOTE(s) /* nothing */
58 : #endif
59 :
60 : /*
61 : - matcher - the actual matching engine
62 : == static int matcher(register struct re_guts *g, char *string, \
63 : == size_t nmatch, regmatch_t pmatch[], int eflags);
64 : */
65 : static int /* 0 success, REG_NOMATCH failure */
66 : matcher(g, string, nmatch, pmatch, eflags)
67 : register struct re_guts *g;
68 : unsigned char *string;
69 : size_t nmatch;
70 : regmatch_t pmatch[];
71 : int eflags;
72 0 : {
73 : register unsigned char *endp;
74 : register size_t i;
75 : struct match mv;
76 0 : register struct match *m = &mv;
77 : register unsigned char *dp;
78 0 : const register sopno gf = g->firststate+1; /* +1 for OEND */
79 0 : const register sopno gl = g->laststate;
80 : unsigned char *start;
81 : unsigned char *stop;
82 :
83 : /* simplify the situation where possible */
84 0 : if (g->cflags®_NOSUB)
85 0 : nmatch = 0;
86 0 : if (eflags®_STARTEND) {
87 0 : start = string + pmatch[0].rm_so;
88 0 : stop = string + pmatch[0].rm_eo;
89 : } else {
90 0 : start = string;
91 0 : stop = start + strlen(start);
92 : }
93 0 : if (stop < start)
94 0 : return(REG_INVARG);
95 :
96 : /* prescreening; this does wonders for this rather slow code */
97 0 : if (g->must != NULL) {
98 0 : for (dp = start; dp < stop; dp++)
99 0 : if (*dp == g->must[0] && stop - dp >= g->mlen &&
100 : memcmp(dp, g->must, (size_t)g->mlen) == 0)
101 0 : break;
102 0 : if (dp == stop) /* we didn't find g->must */
103 0 : return(REG_NOMATCH);
104 : }
105 :
106 : /* match struct setup */
107 0 : m->g = g;
108 0 : m->eflags = eflags;
109 0 : m->pmatch = NULL;
110 0 : m->lastpos = NULL;
111 0 : m->offp = string;
112 0 : m->beginp = start;
113 0 : m->endp = stop;
114 0 : STATESETUP(m, 4);
115 0 : SETUP(m->st);
116 0 : SETUP(m->fresh);
117 0 : SETUP(m->tmp);
118 0 : SETUP(m->empty);
119 0 : CLEAR(m->empty);
120 :
121 : /* this loop does only one repetition except for backrefs */
122 : for (;;) {
123 0 : endp = fast(m, start, stop, gf, gl);
124 0 : if (endp == NULL) { /* a miss */
125 0 : STATETEARDOWN(m);
126 0 : return(REG_NOMATCH);
127 : }
128 0 : if (nmatch == 0 && !g->backrefs)
129 0 : break; /* no further info needed */
130 :
131 : /* where? */
132 : assert(m->coldp != NULL);
133 : for (;;) {
134 : NOTE("finding start");
135 0 : endp = slow(m, m->coldp, stop, gf, gl);
136 0 : if (endp != NULL)
137 0 : break;
138 : assert(m->coldp < m->endp);
139 0 : m->coldp++;
140 0 : }
141 0 : if (nmatch == 1 && !g->backrefs)
142 0 : break; /* no further info needed */
143 :
144 : /* oh my, he wants the subexpressions... */
145 0 : if (m->pmatch == NULL)
146 0 : m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
147 : sizeof(regmatch_t));
148 0 : if (m->pmatch == NULL) {
149 0 : STATETEARDOWN(m);
150 0 : return(REG_ESPACE);
151 : }
152 0 : for (i = 1; i <= m->g->nsub; i++)
153 0 : m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
154 0 : if (!g->backrefs && !(m->eflags®_BACKR)) {
155 : NOTE("dissecting");
156 0 : dp = dissect(m, m->coldp, endp, gf, gl);
157 : } else {
158 0 : if (g->nplus > 0 && m->lastpos == NULL)
159 0 : m->lastpos = (unsigned char **)malloc((g->nplus+1) *
160 : sizeof(unsigned char *));
161 0 : if (g->nplus > 0 && m->lastpos == NULL) {
162 0 : free((char *)m->pmatch);
163 0 : STATETEARDOWN(m);
164 0 : return(REG_ESPACE);
165 : }
166 : NOTE("backref dissect");
167 0 : dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
168 : }
169 0 : if (dp != NULL)
170 0 : break;
171 :
172 : /* uh-oh... we couldn't find a subexpression-level match */
173 : assert(g->backrefs); /* must be back references doing it */
174 : assert(g->nplus == 0 || m->lastpos != NULL);
175 : for (;;) {
176 0 : if (dp != NULL || endp <= m->coldp)
177 : break; /* defeat */
178 : NOTE("backoff");
179 0 : endp = slow(m, m->coldp, endp-1, gf, gl);
180 0 : if (endp == NULL)
181 0 : break; /* defeat */
182 : /* try it on a shorter possibility */
183 : #ifndef NDEBUG
184 : for (i = 1; i <= m->g->nsub; i++) {
185 : assert(m->pmatch[i].rm_so == -1);
186 : assert(m->pmatch[i].rm_eo == -1);
187 : }
188 : #endif
189 : NOTE("backoff dissect");
190 0 : dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
191 0 : }
192 : assert(dp == NULL || dp == endp);
193 0 : if (dp != NULL) /* found a shorter one */
194 0 : break;
195 :
196 : /* despite initial appearances, there is no match here */
197 : NOTE("false alarm");
198 0 : start = m->coldp + 1; /* recycle starting later */
199 : assert(start <= stop);
200 0 : }
201 :
202 : /* fill in the details if requested */
203 0 : if (nmatch > 0) {
204 0 : pmatch[0].rm_so = m->coldp - m->offp;
205 0 : pmatch[0].rm_eo = endp - m->offp;
206 : }
207 0 : if (nmatch > 1) {
208 : assert(m->pmatch != NULL);
209 0 : for (i = 1; i < nmatch; i++)
210 0 : if (i <= m->g->nsub)
211 0 : pmatch[i] = m->pmatch[i];
212 : else {
213 0 : pmatch[i].rm_so = -1;
214 0 : pmatch[i].rm_eo = -1;
215 : }
216 : }
217 :
218 0 : if (m->pmatch != NULL)
219 0 : free((char *)m->pmatch);
220 0 : if (m->lastpos != NULL)
221 0 : free((char *)m->lastpos);
222 0 : STATETEARDOWN(m);
223 0 : return(0);
224 : }
225 :
226 : /*
227 : - dissect - figure out what matched what, no back references
228 : == static unsigned char *dissect(register struct match *m, unsigned char *start, \
229 : == unsigned char *stop, sopno startst, sopno stopst);
230 : */
231 : static unsigned char * /* == stop (success) always */
232 : dissect(m, start, stop, startst, stopst)
233 : register struct match *m;
234 : unsigned char *start;
235 : unsigned char *stop;
236 : sopno startst;
237 : sopno stopst;
238 0 : {
239 : register int i;
240 : register sopno ss; /* start sop of current subRE */
241 : register sopno es; /* end sop of current subRE */
242 : register unsigned char *sp; /* start of string matched by it */
243 : register unsigned char *stp; /* string matched by it cannot pass here */
244 : register unsigned char *rest; /* start of rest of string */
245 : register unsigned char *tail; /* string unmatched by rest of RE */
246 : register sopno ssub; /* start sop of subsubRE */
247 : register sopno esub; /* end sop of subsubRE */
248 : register unsigned char *ssp; /* start of string matched by subsubRE */
249 : register unsigned char *sep; /* end of string matched by subsubRE */
250 : register unsigned char *oldssp; /* previous ssp */
251 : register unsigned char *dp;
252 :
253 : AT("diss", start, stop, startst, stopst);
254 0 : sp = start;
255 0 : for (ss = startst; ss < stopst; ss = es) {
256 : /* identify end of subRE */
257 0 : es = ss;
258 0 : switch (OP(m->g->strip[es])) {
259 : case OPLUS_:
260 : case OQUEST_:
261 0 : es += OPND(m->g->strip[es]);
262 0 : break;
263 : case OCH_:
264 0 : while (OP(m->g->strip[es]) != O_CH)
265 0 : es += OPND(m->g->strip[es]);
266 : break;
267 : }
268 0 : es++;
269 :
270 : /* figure out what it matched */
271 0 : switch (OP(m->g->strip[ss])) {
272 : case OEND:
273 : assert(PHP_REGEX_NOPE);
274 0 : break;
275 : case OCHAR:
276 0 : sp++;
277 0 : break;
278 : case OBOL:
279 : case OEOL:
280 : case OBOW:
281 : case OEOW:
282 0 : break;
283 : case OANY:
284 : case OANYOF:
285 0 : sp++;
286 0 : break;
287 : case OBACK_:
288 : case O_BACK:
289 : assert(PHP_REGEX_NOPE);
290 0 : break;
291 : /* cases where length of match is hard to find */
292 : case OQUEST_:
293 0 : stp = stop;
294 : for (;;) {
295 : /* how long could this one be? */
296 0 : rest = slow(m, sp, stp, ss, es);
297 : assert(rest != NULL); /* it did match */
298 : /* could the rest match the rest? */
299 0 : tail = slow(m, rest, stop, es, stopst);
300 0 : if (tail == stop)
301 0 : break; /* yes! */
302 : /* no -- try a shorter match for this one */
303 0 : stp = rest - 1;
304 : assert(stp >= sp); /* it did work */
305 0 : }
306 0 : ssub = ss + 1;
307 0 : esub = es - 1;
308 : /* did innards match? */
309 0 : if (slow(m, sp, rest, ssub, esub) != NULL) {
310 0 : dp = dissect(m, sp, rest, ssub, esub);
311 : assert(dp == rest);
312 : } else /* no */
313 : assert(sp == rest);
314 0 : sp = rest;
315 0 : break;
316 : case OPLUS_:
317 0 : stp = stop;
318 : for (;;) {
319 : /* how long could this one be? */
320 0 : rest = slow(m, sp, stp, ss, es);
321 : assert(rest != NULL); /* it did match */
322 : /* could the rest match the rest? */
323 0 : tail = slow(m, rest, stop, es, stopst);
324 0 : if (tail == stop)
325 0 : break; /* yes! */
326 : /* no -- try a shorter match for this one */
327 0 : stp = rest - 1;
328 : assert(stp >= sp); /* it did work */
329 0 : }
330 0 : ssub = ss + 1;
331 0 : esub = es - 1;
332 0 : ssp = sp;
333 0 : oldssp = ssp;
334 : for (;;) { /* find last match of innards */
335 0 : sep = slow(m, ssp, rest, ssub, esub);
336 0 : if (sep == NULL || sep == ssp)
337 : break; /* failed or matched null */
338 0 : oldssp = ssp; /* on to next try */
339 0 : ssp = sep;
340 0 : }
341 0 : if (sep == NULL) {
342 : /* last successful match */
343 0 : sep = ssp;
344 0 : ssp = oldssp;
345 : }
346 : assert(sep == rest); /* must exhaust substring */
347 : assert(slow(m, ssp, sep, ssub, esub) == rest);
348 0 : dp = dissect(m, ssp, sep, ssub, esub);
349 : assert(dp == sep);
350 0 : sp = rest;
351 0 : break;
352 : case OCH_:
353 0 : stp = stop;
354 : for (;;) {
355 : /* how long could this one be? */
356 0 : rest = slow(m, sp, stp, ss, es);
357 : assert(rest != NULL); /* it did match */
358 : /* could the rest match the rest? */
359 0 : tail = slow(m, rest, stop, es, stopst);
360 0 : if (tail == stop)
361 0 : break; /* yes! */
362 : /* no -- try a shorter match for this one */
363 0 : stp = rest - 1;
364 : assert(stp >= sp); /* it did work */
365 0 : }
366 0 : ssub = ss + 1;
367 0 : esub = ss + OPND(m->g->strip[ss]) - 1;
368 : assert(OP(m->g->strip[esub]) == OOR1);
369 : for (;;) { /* find first matching branch */
370 0 : if (slow(m, sp, rest, ssub, esub) == rest)
371 0 : break; /* it matched all of it */
372 : /* that one missed, try next one */
373 : assert(OP(m->g->strip[esub]) == OOR1);
374 0 : esub++;
375 : assert(OP(m->g->strip[esub]) == OOR2);
376 0 : ssub = esub + 1;
377 0 : esub += OPND(m->g->strip[esub]);
378 0 : if (OP(m->g->strip[esub]) == OOR2)
379 0 : esub--;
380 : else
381 : assert(OP(m->g->strip[esub]) == O_CH);
382 0 : }
383 0 : dp = dissect(m, sp, rest, ssub, esub);
384 : assert(dp == rest);
385 0 : sp = rest;
386 0 : break;
387 : case O_PLUS:
388 : case O_QUEST:
389 : case OOR1:
390 : case OOR2:
391 : case O_CH:
392 : assert(PHP_REGEX_NOPE);
393 0 : break;
394 : case OLPAREN:
395 0 : i = OPND(m->g->strip[ss]);
396 : assert(0 < i && i <= m->g->nsub);
397 0 : m->pmatch[i].rm_so = sp - m->offp;
398 0 : break;
399 : case ORPAREN:
400 0 : i = OPND(m->g->strip[ss]);
401 : assert(0 < i && i <= m->g->nsub);
402 0 : m->pmatch[i].rm_eo = sp - m->offp;
403 : break;
404 : default: /* uh oh */
405 : assert(PHP_REGEX_NOPE);
406 : break;
407 : }
408 : }
409 :
410 : assert(sp == stop);
411 0 : return(sp);
412 : }
413 :
414 : /*
415 : - backref - figure out what matched what, figuring in back references
416 : == static unsigned char *backref(register struct match *m, unsigned char *start, \
417 : == unsigned char *stop, sopno startst, sopno stopst, sopno lev);
418 : */
419 : static unsigned char * /* == stop (success) or NULL (failure) */
420 : backref(m, start, stop, startst, stopst, lev)
421 : register struct match *m;
422 : unsigned char *start;
423 : unsigned char *stop;
424 : sopno startst;
425 : sopno stopst;
426 : sopno lev; /* PLUS nesting level */
427 0 : {
428 : register int i;
429 : register sopno ss; /* start sop of current subRE */
430 : register unsigned char *sp; /* start of string matched by it */
431 : register sopno ssub; /* start sop of subsubRE */
432 : register sopno esub; /* end sop of subsubRE */
433 : register unsigned char *ssp; /* start of string matched by subsubRE */
434 : register unsigned char *dp;
435 : register size_t len;
436 : register int hard;
437 : register sop s;
438 : register regoff_t offsave;
439 : register cset *cs;
440 :
441 : AT("back", start, stop, startst, stopst);
442 0 : sp = start;
443 :
444 : /* get as far as we can with easy stuff */
445 0 : hard = 0;
446 0 : for (ss = startst; !hard && ss < stopst; ss++)
447 0 : switch (OP(s = m->g->strip[ss])) {
448 : case OCHAR:
449 0 : if (sp == stop || *sp++ != (unsigned char)OPND(s))
450 0 : return(NULL);
451 0 : break;
452 : case OANY:
453 0 : if (sp == stop)
454 0 : return(NULL);
455 0 : sp++;
456 0 : break;
457 : case OANYOF:
458 0 : cs = &m->g->sets[OPND(s)];
459 0 : if (sp == stop || !CHIN(cs, *sp++))
460 0 : return(NULL);
461 0 : break;
462 : case OBOL:
463 0 : if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
464 : (sp < m->endp && *(sp-1) == '\n' &&
465 : (m->g->cflags®_NEWLINE)) )
466 : { /* yes */ }
467 : else
468 0 : return(NULL);
469 0 : break;
470 : case OEOL:
471 0 : if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
472 : (sp < m->endp && *sp == '\n' &&
473 : (m->g->cflags®_NEWLINE)) )
474 : { /* yes */ }
475 : else
476 0 : return(NULL);
477 0 : break;
478 : case OBOW:
479 0 : if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
480 : (sp < m->endp && *(sp-1) == '\n' &&
481 : (m->g->cflags®_NEWLINE)) ||
482 : (sp > m->beginp &&
483 : !ISWORD(*(sp-1))) ) &&
484 : (sp < m->endp && ISWORD(*sp)) )
485 : { /* yes */ }
486 : else
487 0 : return(NULL);
488 0 : break;
489 : case OEOW:
490 0 : if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
491 : (sp < m->endp && *sp == '\n' &&
492 : (m->g->cflags®_NEWLINE)) ||
493 : (sp < m->endp && !ISWORD(*sp)) ) &&
494 : (sp > m->beginp && ISWORD(*(sp-1))) )
495 : { /* yes */ }
496 : else
497 0 : return(NULL);
498 0 : break;
499 : case O_QUEST:
500 0 : break;
501 : case OOR1: /* matches null but needs to skip */
502 0 : ss++;
503 0 : s = m->g->strip[ss];
504 : do {
505 : assert(OP(s) == OOR2);
506 0 : ss += OPND(s);
507 0 : } while (OP(s = m->g->strip[ss]) != O_CH);
508 : /* note that the ss++ gets us past the O_CH */
509 0 : break;
510 : default: /* have to make a choice */
511 0 : hard = 1;
512 : break;
513 : }
514 0 : if (!hard) { /* that was it! */
515 0 : if (sp != stop)
516 0 : return(NULL);
517 0 : return(sp);
518 : }
519 0 : ss--; /* adjust for the for's final increment */
520 :
521 : /* the hard stuff */
522 : AT("hard", sp, stop, ss, stopst);
523 0 : s = m->g->strip[ss];
524 0 : switch (OP(s)) {
525 : case OBACK_: /* the vilest depths */
526 0 : i = OPND(s);
527 : assert(0 < i && i <= m->g->nsub);
528 0 : if (m->pmatch[i].rm_eo == -1)
529 0 : return(NULL);
530 : assert(m->pmatch[i].rm_so != -1);
531 0 : len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
532 : assert(stop - m->beginp >= len);
533 0 : if (sp > stop - len)
534 0 : return(NULL); /* not enough left to match */
535 0 : ssp = m->offp + m->pmatch[i].rm_so;
536 0 : if (memcmp(sp, ssp, len) != 0)
537 0 : return(NULL);
538 0 : while (m->g->strip[ss] != SOP(O_BACK, i))
539 0 : ss++;
540 0 : return(backref(m, sp+len, stop, ss+1, stopst, lev));
541 : break;
542 : case OQUEST_: /* to null or not */
543 0 : dp = backref(m, sp, stop, ss+1, stopst, lev);
544 0 : if (dp != NULL)
545 0 : return(dp); /* not */
546 0 : return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
547 : break;
548 : case OPLUS_:
549 : assert(m->lastpos != NULL);
550 : assert(lev+1 <= m->g->nplus);
551 0 : m->lastpos[lev+1] = sp;
552 0 : return(backref(m, sp, stop, ss+1, stopst, lev+1));
553 : break;
554 : case O_PLUS:
555 0 : if (sp == m->lastpos[lev]) /* last pass matched null */
556 0 : return(backref(m, sp, stop, ss+1, stopst, lev-1));
557 : /* try another pass */
558 0 : m->lastpos[lev] = sp;
559 0 : dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
560 0 : if (dp == NULL)
561 0 : return(backref(m, sp, stop, ss+1, stopst, lev-1));
562 : else
563 0 : return(dp);
564 : break;
565 : case OCH_: /* find the right one, if any */
566 0 : ssub = ss + 1;
567 0 : esub = ss + OPND(s) - 1;
568 : assert(OP(m->g->strip[esub]) == OOR1);
569 : for (;;) { /* find first matching branch */
570 0 : dp = backref(m, sp, stop, ssub, esub, lev);
571 0 : if (dp != NULL)
572 0 : return(dp);
573 : /* that one missed, try next one */
574 0 : if (OP(m->g->strip[esub]) == O_CH)
575 0 : return(NULL); /* there is none */
576 0 : esub++;
577 : assert(OP(m->g->strip[esub]) == OOR2);
578 0 : ssub = esub + 1;
579 0 : esub += OPND(m->g->strip[esub]);
580 0 : if (OP(m->g->strip[esub]) == OOR2)
581 0 : esub--;
582 : else
583 : assert(OP(m->g->strip[esub]) == O_CH);
584 0 : }
585 : break;
586 : case OLPAREN: /* must undo assignment if rest fails */
587 0 : i = OPND(s);
588 : assert(0 < i && i <= m->g->nsub);
589 0 : offsave = m->pmatch[i].rm_so;
590 0 : m->pmatch[i].rm_so = sp - m->offp;
591 0 : dp = backref(m, sp, stop, ss+1, stopst, lev);
592 0 : if (dp != NULL)
593 0 : return(dp);
594 0 : m->pmatch[i].rm_so = offsave;
595 0 : return(NULL);
596 : break;
597 : case ORPAREN: /* must undo assignment if rest fails */
598 0 : i = OPND(s);
599 : assert(0 < i && i <= m->g->nsub);
600 0 : offsave = m->pmatch[i].rm_eo;
601 0 : m->pmatch[i].rm_eo = sp - m->offp;
602 0 : dp = backref(m, sp, stop, ss+1, stopst, lev);
603 0 : if (dp != NULL)
604 0 : return(dp);
605 0 : m->pmatch[i].rm_eo = offsave;
606 0 : return(NULL);
607 : break;
608 : default: /* uh oh */
609 : assert(PHP_REGEX_NOPE);
610 : break;
611 : }
612 :
613 : /* "can't happen" */
614 : assert(PHP_REGEX_NOPE);
615 : /* NOTREACHED */
616 0 : return((unsigned char *)NULL); /* dummy */
617 : }
618 :
619 : /*
620 : - fast - step through the string at top speed
621 : == static unsigned char *fast(register struct match *m, unsigned char *start, \
622 : == unsigned char *stop, sopno startst, sopno stopst);
623 : */
624 : static unsigned char * /* where tentative match ended, or NULL */
625 : fast(m, start, stop, startst, stopst)
626 : register struct match *m;
627 : unsigned char *start;
628 : unsigned char *stop;
629 : sopno startst;
630 : sopno stopst;
631 0 : {
632 0 : register states st = m->st;
633 0 : register states fresh = m->fresh;
634 0 : register states tmp = m->tmp;
635 0 : register unsigned char *p = start;
636 0 : register int c = (start == m->beginp) ? OUT : *(start-1);
637 : register int lastc; /* previous c */
638 : register int flagch;
639 : register int i;
640 : register unsigned char *coldp; /* last p after which no match was underway */
641 :
642 0 : CLEAR(st);
643 0 : SET1(st, startst);
644 0 : st = step(m->g, startst, stopst, st, NOTHING, st);
645 0 : ASSIGN(fresh, st);
646 : SP("start", st, *p);
647 0 : coldp = NULL;
648 : for (;;) {
649 : /* next character */
650 0 : lastc = c;
651 0 : c = (p == m->endp) ? OUT : *p;
652 0 : if (EQ(st, fresh))
653 0 : coldp = p;
654 :
655 : /* is there an EOL and/or BOL between lastc and c? */
656 0 : flagch = '\0';
657 0 : i = 0;
658 0 : if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
659 : (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
660 0 : flagch = BOL;
661 0 : i = m->g->nbol;
662 : }
663 0 : if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
664 : (c == OUT && !(m->eflags®_NOTEOL)) ) {
665 0 : flagch = (flagch == BOL) ? BOLEOL : EOL;
666 0 : i += m->g->neol;
667 : }
668 0 : if (i != 0) {
669 0 : for (; i > 0; i--)
670 0 : st = step(m->g, startst, stopst, st, flagch, st);
671 : SP("boleol", st, c);
672 : }
673 :
674 : /* how about a word boundary? */
675 0 : if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
676 : (c != OUT && ISWORD(c)) ) {
677 0 : flagch = BOW;
678 : }
679 0 : if ( (lastc != OUT && ISWORD(lastc)) &&
680 : (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
681 0 : flagch = EOW;
682 : }
683 0 : if (flagch == BOW || flagch == EOW) {
684 0 : st = step(m->g, startst, stopst, st, flagch, st);
685 : SP("boweow", st, c);
686 : }
687 :
688 : /* are we done? */
689 0 : if (ISSET(st, stopst) || p == stop)
690 : break; /* NOTE BREAK OUT */
691 :
692 : /* no, we must deal with this character */
693 0 : ASSIGN(tmp, st);
694 0 : ASSIGN(st, fresh);
695 : assert(c != OUT);
696 0 : st = step(m->g, startst, stopst, tmp, c, st);
697 : SP("aft", st, c);
698 : assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
699 0 : p++;
700 0 : }
701 :
702 : assert(coldp != NULL);
703 0 : m->coldp = coldp;
704 0 : if (ISSET(st, stopst))
705 0 : return(p+1);
706 : else
707 0 : return(NULL);
708 : }
709 :
710 : /*
711 : - slow - step through the string more deliberately
712 : == static unsigned char *slow(register struct match *m, unsigned char *start, \
713 : == unsigned char *stop, sopno startst, sopno stopst);
714 : */
715 : static unsigned char * /* where it ended */
716 : slow(m, start, stop, startst, stopst)
717 : register struct match *m;
718 : unsigned char *start;
719 : unsigned char *stop;
720 : sopno startst;
721 : sopno stopst;
722 0 : {
723 0 : register states st = m->st;
724 0 : register states empty = m->empty;
725 0 : register states tmp = m->tmp;
726 0 : register unsigned char *p = start;
727 0 : register int c = (start == m->beginp) ? OUT : *(start-1);
728 : register int lastc; /* previous c */
729 : register int flagch;
730 : register int i;
731 : register unsigned char *matchp; /* last p at which a match ended */
732 :
733 : AT("slow", start, stop, startst, stopst);
734 0 : CLEAR(st);
735 0 : SET1(st, startst);
736 : SP("sstart", st, *p);
737 0 : st = step(m->g, startst, stopst, st, NOTHING, st);
738 0 : matchp = NULL;
739 : for (;;) {
740 : /* next character */
741 0 : lastc = c;
742 0 : c = (p == m->endp) ? OUT : *p;
743 :
744 : /* is there an EOL and/or BOL between lastc and c? */
745 0 : flagch = '\0';
746 0 : i = 0;
747 0 : if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
748 : (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
749 0 : flagch = BOL;
750 0 : i = m->g->nbol;
751 : }
752 0 : if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
753 : (c == OUT && !(m->eflags®_NOTEOL)) ) {
754 0 : flagch = (flagch == BOL) ? BOLEOL : EOL;
755 0 : i += m->g->neol;
756 : }
757 0 : if (i != 0) {
758 0 : for (; i > 0; i--)
759 0 : st = step(m->g, startst, stopst, st, flagch, st);
760 : SP("sboleol", st, c);
761 : }
762 :
763 : /* how about a word boundary? */
764 0 : if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
765 : (c != OUT && ISWORD(c)) ) {
766 0 : flagch = BOW;
767 : }
768 0 : if ( (lastc != OUT && ISWORD(lastc)) &&
769 : (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
770 0 : flagch = EOW;
771 : }
772 0 : if (flagch == BOW || flagch == EOW) {
773 0 : st = step(m->g, startst, stopst, st, flagch, st);
774 : SP("sboweow", st, c);
775 : }
776 :
777 : /* are we done? */
778 0 : if (ISSET(st, stopst))
779 0 : matchp = p;
780 0 : if (EQ(st, empty) || p == stop)
781 : break; /* NOTE BREAK OUT */
782 :
783 : /* no, we must deal with this character */
784 0 : ASSIGN(tmp, st);
785 0 : ASSIGN(st, empty);
786 : assert(c != OUT);
787 0 : st = step(m->g, startst, stopst, tmp, c, st);
788 : SP("saft", st, c);
789 : assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
790 0 : p++;
791 0 : }
792 :
793 0 : return(matchp);
794 : }
795 :
796 :
797 : /*
798 : - step - map set of states reachable before char to set reachable after
799 : == static states step(register struct re_guts *g, sopno start, sopno stop, \
800 : == register states bef, int ch, register states aft);
801 : == #define BOL (OUT+1)
802 : == #define EOL (BOL+1)
803 : == #define BOLEOL (BOL+2)
804 : == #define NOTHING (BOL+3)
805 : == #define BOW (BOL+4)
806 : == #define EOW (BOL+5)
807 : == #define CODEMAX (BOL+5) // highest code used
808 : == #define NONCHAR(c) ((c) > UCHAR_MAX)
809 : == #define NNONCHAR (CODEMAX-UCHAR_MAX)
810 : */
811 : static states
812 : step(g, start, stop, bef, ch, aft)
813 : register struct re_guts *g;
814 : sopno start; /* start state within strip */
815 : sopno stop; /* state after stop state within strip */
816 : register states bef; /* states reachable before */
817 : int ch; /* character or NONCHAR code */
818 : register states aft; /* states already known reachable after */
819 0 : {
820 : register cset *cs;
821 : register sop s;
822 : register sopno pc;
823 : register onestate here; /* note, macros know this name */
824 : register sopno look;
825 : register long i;
826 :
827 0 : for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
828 0 : s = g->strip[pc];
829 0 : switch (OP(s)) {
830 : case OEND:
831 : assert(pc == stop-1);
832 0 : break;
833 : case OCHAR:
834 : /* only characters can match */
835 : assert(!NONCHAR(ch) || ch != (unsigned char)OPND(s));
836 0 : if (ch == (unsigned char)OPND(s))
837 0 : FWD(aft, bef, 1);
838 0 : break;
839 : case OBOL:
840 0 : if (ch == BOL || ch == BOLEOL)
841 0 : FWD(aft, bef, 1);
842 0 : break;
843 : case OEOL:
844 0 : if (ch == EOL || ch == BOLEOL)
845 0 : FWD(aft, bef, 1);
846 0 : break;
847 : case OBOW:
848 0 : if (ch == BOW)
849 0 : FWD(aft, bef, 1);
850 0 : break;
851 : case OEOW:
852 0 : if (ch == EOW)
853 0 : FWD(aft, bef, 1);
854 0 : break;
855 : case OANY:
856 0 : if (!NONCHAR(ch))
857 0 : FWD(aft, bef, 1);
858 0 : break;
859 : case OANYOF:
860 0 : cs = &g->sets[OPND(s)];
861 0 : if (!NONCHAR(ch) && CHIN(cs, ch))
862 0 : FWD(aft, bef, 1);
863 0 : break;
864 : case OBACK_: /* ignored here */
865 : case O_BACK:
866 0 : FWD(aft, aft, 1);
867 0 : break;
868 : case OPLUS_: /* forward, this is just an empty */
869 0 : FWD(aft, aft, 1);
870 0 : break;
871 : case O_PLUS: /* both forward and back */
872 0 : FWD(aft, aft, 1);
873 0 : i = ISSETBACK(aft, OPND(s));
874 0 : BACK(aft, aft, OPND(s));
875 0 : if (!i && ISSETBACK(aft, OPND(s))) {
876 : /* oho, must reconsider loop body */
877 0 : pc -= OPND(s) + 1;
878 0 : INIT(here, pc);
879 : }
880 0 : break;
881 : case OQUEST_: /* two branches, both forward */
882 0 : FWD(aft, aft, 1);
883 0 : FWD(aft, aft, OPND(s));
884 0 : break;
885 : case O_QUEST: /* just an empty */
886 0 : FWD(aft, aft, 1);
887 0 : break;
888 : case OLPAREN: /* not significant here */
889 : case ORPAREN:
890 0 : FWD(aft, aft, 1);
891 0 : break;
892 : case OCH_: /* mark the first two branches */
893 0 : FWD(aft, aft, 1);
894 : assert(OP(g->strip[pc+OPND(s)]) == OOR2);
895 0 : FWD(aft, aft, OPND(s));
896 0 : break;
897 : case OOR1: /* done a branch, find the O_CH */
898 0 : if (ISSTATEIN(aft, here)) {
899 0 : for (look = 1;
900 0 : OP(s = g->strip[pc+look]) != O_CH;
901 0 : look += OPND(s))
902 : assert(OP(s) == OOR2);
903 0 : FWD(aft, aft, look);
904 : }
905 0 : break;
906 : case OOR2: /* propagate OCH_'s marking */
907 0 : FWD(aft, aft, 1);
908 0 : if (OP(g->strip[pc+OPND(s)]) != O_CH) {
909 : assert(OP(g->strip[pc+OPND(s)]) == OOR2);
910 0 : FWD(aft, aft, OPND(s));
911 : }
912 0 : break;
913 : case O_CH: /* just empty */
914 0 : FWD(aft, aft, 1);
915 : break;
916 : default: /* ooooops... */
917 : assert(PHP_REGEX_NOPE);
918 : break;
919 : }
920 : }
921 :
922 0 : return(aft);
923 : }
924 :
925 : #ifdef REDEBUG
926 : /*
927 : - print - print a set of states
928 : == #ifdef REDEBUG
929 : == static void print(struct match *m, unsigned char *caption, states st, \
930 : == int ch, FILE *d);
931 : == #endif
932 : */
933 : static void
934 : print(m, caption, st, ch, d)
935 : struct match *m;
936 : unsigned char *caption;
937 : states st;
938 : int ch;
939 : FILE *d;
940 : {
941 : register struct re_guts *g = m->g;
942 : register int i;
943 : register int first = 1;
944 :
945 : if (!(m->eflags®_TRACE))
946 : return;
947 :
948 : fprintf(d, "%s", caption);
949 : if (ch != '\0')
950 : fprintf(d, " %s", pchar(ch));
951 : for (i = 0; i < g->nstates; i++)
952 : if (ISSET(st, i)) {
953 : fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
954 : first = 0;
955 : }
956 : fprintf(d, "\n");
957 : }
958 :
959 : /*
960 : - at - print current situation
961 : == #ifdef REDEBUG
962 : == static void at(struct match *m, unsigned char *title, unsigned char *start, unsigned char *stop, \
963 : == sopno startst, sopno stopst);
964 : == #endif
965 : */
966 : static void
967 : at(m, title, start, stop, startst, stopst)
968 : struct match *m;
969 : unsigned char *title;
970 : unsigned char *start;
971 : unsigned char *stop;
972 : sopno startst;
973 : sopno stopst;
974 : {
975 : if (!(m->eflags®_TRACE))
976 : return;
977 :
978 : printf("%s %s-", title, pchar(*start));
979 : printf("%s ", pchar(*stop));
980 : printf("%ld-%ld\n", (long)startst, (long)stopst);
981 : }
982 :
983 : #ifndef PCHARDONE
984 : #define PCHARDONE /* never again */
985 : /*
986 : - pchar - make a character printable
987 : == #ifdef REDEBUG
988 : == static unsigned char *pchar(int ch);
989 : == #endif
990 : *
991 : * Is this identical to regchar() over in debug.c? Well, yes. But a
992 : * duplicate here avoids having a debugging-capable regexec.o tied to
993 : * a matching debug.o, and this is convenient. It all disappears in
994 : * the non-debug compilation anyway, so it doesn't matter much.
995 : */
996 : static unsigned char * /* -> representation */
997 : pchar(ch)
998 : int ch;
999 : {
1000 : static unsigned char pbuf[10];
1001 :
1002 : if (isprint(ch) || ch == ' ')
1003 : sprintf(pbuf, "%c", ch);
1004 : else
1005 : sprintf(pbuf, "\\%o", ch);
1006 : return(pbuf);
1007 : }
1008 : #endif
1009 : #endif
1010 :
1011 : #undef matcher
1012 : #undef fast
1013 : #undef slow
1014 : #undef dissect
1015 : #undef backref
1016 : #undef step
1017 : #undef print
1018 : #undef at
1019 : #undef match
|