1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /* PCRE is a library of functions to support regular expressions whose syntax
6 : and semantics are as close as possible to those of the Perl 5 language.
7 :
8 : Written by Philip Hazel
9 : Copyright (c) 1997-2006 University of Cambridge
10 :
11 : -----------------------------------------------------------------------------
12 : Redistribution and use in source and binary forms, with or without
13 : modification, are permitted provided that the following conditions are met:
14 :
15 : * Redistributions of source code must retain the above copyright notice,
16 : this list of conditions and the following disclaimer.
17 :
18 : * Redistributions in binary form must reproduce the above copyright
19 : notice, this list of conditions and the following disclaimer in the
20 : documentation and/or other materials provided with the distribution.
21 :
22 : * Neither the name of the University of Cambridge nor the names of its
23 : contributors may be used to endorse or promote products derived from
24 : this software without specific prior written permission.
25 :
26 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 : POSSIBILITY OF SUCH DAMAGE.
37 : -----------------------------------------------------------------------------
38 : */
39 :
40 :
41 : /* This module contains pcre_exec(), the externally visible function that does
42 : pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 : possible. There are also some static supporting functions. */
44 :
45 : #define NLBLOCK md /* Block containing newline information */
46 : #define PSSTART start_subject /* Field containing processed string start */
47 : #define PSEND end_subject /* Field containing processed string end */
48 :
49 : #include "pcre_internal.h"
50 :
51 : /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
52 : obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
53 :
54 : #define EPTR_WORK_SIZE (1000)
55 :
56 : /* Flag bits for the match() function */
57 :
58 : #define match_condassert 0x01 /* Called to check a condition assertion */
59 : #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
60 : #define match_tail_recursed 0x04 /* Tail recursive call */
61 :
62 : /* Non-error returns from the match() function. Error returns are externally
63 : defined PCRE_ERROR_xxx codes, which are all negative. */
64 :
65 : #define MATCH_MATCH 1
66 : #define MATCH_NOMATCH 0
67 :
68 : /* Maximum number of ints of offset to save on the stack for recursive calls.
69 : If the offset vector is bigger, malloc is used. This should be a multiple of 3,
70 : because the offset vector is always a multiple of 3 long. */
71 :
72 : #define REC_STACK_SAVE_MAX 30
73 :
74 : /* Min and max values for the common repeats; for the maxima, 0 => infinity */
75 :
/* The two tables are indexed in parallel and read naturally as three pairs of
entries. NOTE(review): presumably each pair is the greedy and minimizing form
of one quantifier, in opcode order - confirm against the opcode definitions. */

static const char rep_min[] = {
  0, 0,      /* entries 0-1: minimum of zero */
  1, 1,      /* entries 2-3: minimum of one */
  0, 0 };    /* entries 4-5: minimum of zero */

static const char rep_max[] = {
  0, 0,      /* entries 0-1: 0 => no upper limit */
  0, 0,      /* entries 2-3: 0 => no upper limit */
  1, 1 };    /* entries 4-5: at most one */
78 :
79 :
80 :
81 : #ifdef DEBUG
82 : /*************************************************
83 : * Debugging function to print chars *
84 : *************************************************/
85 :
86 : /* Print a sequence of chars in printable format, stopping at the end of the
87 : subject if requested.
88 :
89 : Arguments:
90 : p points to characters
91 : length number to print
92 : is_subject TRUE if printing from within md->start_subject
93 : md pointer to matching data block, if is_subject is TRUE
94 :
95 : Returns: nothing
96 : */
97 :
98 : static void
99 : pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
100 : {
101 : unsigned int c;
102 : if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
103 : while (length-- > 0)
104 : if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
105 : }
106 : #endif
107 :
108 :
109 :
110 : /*************************************************
111 : * Match a back-reference *
112 : *************************************************/
113 :
114 : /* If a back reference hasn't been set, the length that is passed is greater
115 : than the number of characters left in the string, so the match fails.
116 :
117 : Arguments:
118 : offset index into the offset vector
119 : eptr points into the subject
120 : length length to be matched
121 : md points to match data block
122 : ims the ims flags
123 :
124 : Returns: TRUE if matched
125 : */
126 :
127 : static BOOL
128 : match_ref(int offset, register USPTR eptr, int length, match_data *md,
129 : unsigned long int ims)
130 0 : {
131 0 : USPTR p = md->start_subject + md->offset_vector[offset];
132 :
133 : #ifdef DEBUG
134 : if (eptr >= md->end_subject)
135 : printf("matching subject <null>");
136 : else
137 : {
138 : printf("matching subject ");
139 : pchars(eptr, length, TRUE, md);
140 : }
141 : printf(" against backref ");
142 : pchars(p, length, FALSE, md);
143 : printf("\n");
144 : #endif
145 :
146 : /* Always fail if not enough characters left */
147 :
148 0 : if (length > md->end_subject - eptr) return FALSE;
149 :
150 : /* Separate the caselesss case for speed */
151 :
152 0 : if ((ims & PCRE_CASELESS) != 0)
153 : {
154 0 : while (length-- > 0)
155 0 : if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
156 : }
157 : else
158 0 : { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
159 :
160 0 : return TRUE;
161 : }
162 :
163 :
164 :
165 : /***************************************************************************
166 : ****************************************************************************
167 : RECURSION IN THE match() FUNCTION
168 :
169 : The match() function is highly recursive, though not every recursive call
170 : increases the recursive depth. Nevertheless, some regular expressions can cause
171 : it to recurse to a great depth. I was writing for Unix, so I just let it call
172 : itself recursively. This uses the stack for saving everything that has to be
173 : saved for a recursive call. On Unix, the stack can be large, and this works
174 : fine.
175 :
176 : It turns out that on some non-Unix-like systems there are problems with
177 : programs that use a lot of stack. (This despite the fact that every last chip
178 : has oodles of memory these days, and techniques for extending the stack have
179 : been known for decades.) So....
180 :
181 : There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
182 : calls by keeping local variables that need to be preserved in blocks of memory
183 : obtained from malloc() instead of on the stack. Macros are used to
184 : achieve this so that the actual code doesn't look very different to what it
185 : always used to.
186 : ****************************************************************************
187 : ***************************************************************************/
188 :
189 :
190 : /* These versions of the macros use the stack, as normal. There are debugging
191 : versions and production versions. */
192 :
193 : #ifndef NO_RECURSE
#define REGISTER register
#ifdef DEBUG

/* Debugging flavour: the call and the return are each bracketed by a trace
line giving the source line of the macro use. The enclosing function's rdepth
is passed on incremented by one. */

#define RMATCH(res,subj,code,top,mblock,opts,chain,how) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  res = match(subj,code,top,mblock,opts,chain,how,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(val) \
  { \
  printf("match() returned %d from line %d ", val, __LINE__); \
  return val; \
  }
#else

/* Production flavour: a plain recursive call and a plain return. */

#define RMATCH(res,subj,code,top,mblock,opts,chain,how) \
  res = match(subj,code,top,mblock,opts,chain,how,rdepth+1)
#define RRETURN(val) return val
#endif
212 :
213 : #else
214 :
215 :
216 : /* These versions of the macros manage a private stack on the heap. Note
217 : that the rd argument of RMATCH isn't actually used. It's the md argument of
218 : match(), which never changes. */
219 :
220 : #define REGISTER
221 :
/* RMATCH simulates a recursive call of match(). It obtains a new frame, records
the resumption point in the current frame with setjmp(), fills the new frame
with the would-be arguments, makes it current and jumps to HEAP_RECURSE. When
RRETURN later longjmps back, the current frame is recovered from md->thisframe
and the result is collected from its Xresult field.

If the new frame cannot be obtained, the current incarnation gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN, so the error travels up the chain of
frames like any other negative result and the frames are released on the way.
Without this check a failed allocation would be dereferenced. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN simulates a return from match(). The current frame is released; if
there is a previous frame, the result is stored in it, it is published through
md->thisframe, and control longjmps to the point saved by the RMATCH that
created the released frame. When the released frame was the top-level one
(no previous frame), this is a genuine return from the function.

Both macros expand to a brace-enclosed block, so a use that is followed
directly by "else" must itself be wrapped in braces. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
260 :
261 :
262 : /* Structure for remembering the local variables in a private frame */
263 :
264 : typedef struct heapframe {
265 : struct heapframe *Xprevframe;
266 :
267 : /* Function arguments that may change */
268 :
269 : const uschar *Xeptr;
270 : const uschar *Xecode;
271 : int Xoffset_top;
272 : long int Xims;
273 : eptrblock *Xeptrb;
274 : int Xflags;
275 : unsigned int Xrdepth;
276 :
277 : /* Function local variables */
278 :
279 : const uschar *Xcallpat;
280 : const uschar *Xcharptr;
281 : const uschar *Xdata;
282 : const uschar *Xnext;
283 : const uschar *Xpp;
284 : const uschar *Xprev;
285 : const uschar *Xsaved_eptr;
286 :
287 : recursion_info Xnew_recursive;
288 :
289 : BOOL Xcur_is_word;
290 : BOOL Xcondition;
291 : BOOL Xprev_is_word;
292 :
293 : unsigned long int Xoriginal_ims;
294 :
295 : #ifdef SUPPORT_UCP
296 : int Xprop_type;
297 : int Xprop_value;
298 : int Xprop_fail_result;
299 : int Xprop_category;
300 : int Xprop_chartype;
301 : int Xprop_script;
302 : #endif
303 :
304 : int Xctype;
305 : unsigned int Xfc;
306 : int Xfi;
307 : int Xlength;
308 : int Xmax;
309 : int Xmin;
310 : int Xnumber;
311 : int Xoffset;
312 : int Xop;
313 : int Xsave_capture_last;
314 : int Xsave_offset1, Xsave_offset2, Xsave_offset3;
315 : int Xstacksave[REC_STACK_SAVE_MAX];
316 :
317 : eptrblock Xnewptrb;
318 :
319 : /* Place to pass back result, and where to jump back to */
320 :
321 : int Xresult;
322 : jmp_buf Xwhere;
323 :
324 : } heapframe;
325 :
326 : #endif
327 :
328 :
329 : /***************************************************************************
330 : ***************************************************************************/
331 :
332 :
333 :
334 : /*************************************************
335 : * Match from current position *
336 : *************************************************/
337 :
338 : /* This function is called recursively in many circumstances. Whenever it
339 : returns a negative (error) response, the outer incarnation must also return the
340 : same response.
341 :
342 : Performance note: It might be tempting to extract commonly used fields from the
343 : md structure (e.g. utf8, end_subject) into individual variables to improve
344 : performance. Tests using gcc on a SPARC disproved this; in the first case, it
345 : made performance worse.
346 :
347 : Arguments:
348 : eptr pointer to current character in subject
349 : ecode pointer to current position in compiled code
350 : offset_top current top pointer
351 : md pointer to "static" info for the match
352 : ims current /i, /m, and /s options
353 : eptrb pointer to chain of blocks containing eptr at start of
354 : brackets - for testing for empty matches
355 : flags can contain
356 : match_condassert - this is an assertion condition
357 : match_cbegroup - this is the start of an unlimited repeat
358 : group that can match an empty string
359 : match_tail_recursed - this is a tail_recursed group
360 : rdepth the recursion depth
361 :
362 : Returns: MATCH_MATCH if matched ) these values are >= 0
363 : MATCH_NOMATCH if failed to match )
364 : a negative PCRE_ERROR_xxx value if aborted by an error condition
365 : (e.g. stopped by repeated call or recursion limit)
366 : */
367 :
368 : static int
369 : match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
370 : int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
371 : int flags, unsigned int rdepth)
372 34700 : {
373 : /* These variables do not need to be preserved over recursion in this function,
374 : so they can be ordinary variables in all cases. Mark some of them with
375 : "register" because they are used a lot in loops. */
376 :
377 : register int rrc; /* Returns from recursive calls */
378 : register int i; /* Used for loops not involving calls to RMATCH() */
379 : register unsigned int c; /* Character values not kept over RMATCH() calls */
380 : register BOOL utf8; /* Local copy of UTF-8 flag for speed */
381 :
382 : BOOL minimize, possessive; /* Quantifier options */
383 :
384 : /* When recursion is not being used, all "local" variables that have to be
385 : preserved over calls to RMATCH() are part of a "frame" which is obtained from
386 : heap storage. Set up the top-level frame here; others are obtained from the
387 : heap whenever RMATCH() does a "recursion". See the macro definitions above. */
388 :
389 : #ifdef NO_RECURSE
390 : heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
391 : frame->Xprevframe = NULL; /* Marks the top level */
392 :
393 : /* Copy in the original argument variables */
394 :
395 : frame->Xeptr = eptr;
396 : frame->Xecode = ecode;
397 : frame->Xoffset_top = offset_top;
398 : frame->Xims = ims;
399 : frame->Xeptrb = eptrb;
400 : frame->Xflags = flags;
401 : frame->Xrdepth = rdepth;
402 :
403 : /* This is where control jumps back to in order to effect "recursion" */
404 :
405 : HEAP_RECURSE:
406 :
407 : /* Macros make the argument variables come from the current frame */
408 :
409 : #define eptr frame->Xeptr
410 : #define ecode frame->Xecode
411 : #define offset_top frame->Xoffset_top
412 : #define ims frame->Xims
413 : #define eptrb frame->Xeptrb
414 : #define flags frame->Xflags
415 : #define rdepth frame->Xrdepth
416 :
417 : /* Ditto for the local variables */
418 :
419 : #ifdef SUPPORT_UTF8
420 : #define charptr frame->Xcharptr
421 : #endif
422 : #define callpat frame->Xcallpat
423 : #define data frame->Xdata
424 : #define next frame->Xnext
425 : #define pp frame->Xpp
426 : #define prev frame->Xprev
427 : #define saved_eptr frame->Xsaved_eptr
428 :
429 : #define new_recursive frame->Xnew_recursive
430 :
431 : #define cur_is_word frame->Xcur_is_word
432 : #define condition frame->Xcondition
433 : #define prev_is_word frame->Xprev_is_word
434 :
435 : #define original_ims frame->Xoriginal_ims
436 :
437 : #ifdef SUPPORT_UCP
438 : #define prop_type frame->Xprop_type
439 : #define prop_value frame->Xprop_value
440 : #define prop_fail_result frame->Xprop_fail_result
441 : #define prop_category frame->Xprop_category
442 : #define prop_chartype frame->Xprop_chartype
443 : #define prop_script frame->Xprop_script
444 : #endif
445 :
446 : #define ctype frame->Xctype
447 : #define fc frame->Xfc
448 : #define fi frame->Xfi
449 : #define length frame->Xlength
450 : #define max frame->Xmax
451 : #define min frame->Xmin
452 : #define number frame->Xnumber
453 : #define offset frame->Xoffset
454 : #define op frame->Xop
455 : #define save_capture_last frame->Xsave_capture_last
456 : #define save_offset1 frame->Xsave_offset1
457 : #define save_offset2 frame->Xsave_offset2
458 : #define save_offset3 frame->Xsave_offset3
459 : #define stacksave frame->Xstacksave
460 :
461 : #define newptrb frame->Xnewptrb
462 :
463 : /* When recursion is being used, local variables are allocated on the stack and
464 : get preserved during recursion in the normal way. In this environment, fi and
465 : i, and fc and c, can be the same variables. */
466 :
467 : #else /* NO_RECURSE not defined */
468 : #define fi i
469 : #define fc c
470 :
471 :
472 : #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
473 : const uschar *charptr; /* in small blocks of the code. My normal */
474 : #endif /* style of coding would have declared */
475 : const uschar *callpat; /* them within each of those blocks. */
476 : const uschar *data; /* However, in order to accommodate the */
477 : const uschar *next; /* version of this code that uses an */
478 : USPTR pp; /* external "stack" implemented on the */
479 : const uschar *prev; /* heap, it is easier to declare them all */
480 : USPTR saved_eptr; /* here, so the declarations can be cut */
481 : /* out in a block. The only declarations */
482 : recursion_info new_recursive; /* within blocks below are for variables */
483 : /* that do not have to be preserved over */
484 : BOOL cur_is_word; /* a recursive call to RMATCH(). */
485 : BOOL condition;
486 : BOOL prev_is_word;
487 :
488 : unsigned long int original_ims;
489 :
490 : #ifdef SUPPORT_UCP
491 : int prop_type;
492 : int prop_value;
493 : int prop_fail_result;
494 : int prop_category;
495 : int prop_chartype;
496 : int prop_script;
497 : #endif
498 :
499 : int ctype;
500 : int length;
501 : int max;
502 : int min;
503 : int number;
504 : int offset;
505 : int op;
506 : int save_capture_last;
507 : int save_offset1, save_offset2, save_offset3;
508 : int stacksave[REC_STACK_SAVE_MAX];
509 :
510 : eptrblock newptrb;
511 : #endif /* NO_RECURSE */
512 :
513 : /* These statements are here to stop the compiler complaining about uninitialized
514 : variables. */
515 :
516 : #ifdef SUPPORT_UCP
517 34700 : prop_value = 0;
518 34700 : prop_fail_result = 0;
519 : #endif
520 :
521 :
522 : /* This label is used for tail recursion, which is used in a few cases even
523 : when NO_RECURSE is not defined, in order to reduce the amount of stack that is
524 : used. Thanks to Ian Taylor for noticing this possibility and sending the
525 : original patch. */
526 :
527 51978 : TAIL_RECURSE:
528 :
529 : /* OK, now we can get on with the real code of the function. Recursive calls
530 : are specified by the macro RMATCH and RRETURN is used to return. When
531 : NO_RECURSE is *not* defined, these just turn into a recursive call to match()
532 : and a "return", respectively (possibly with some debugging if DEBUG is
533 : defined). However, RMATCH isn't like a function call because it's quite a
534 : complicated macro. It has to be used in one particular way. This shouldn't,
535 : however, impact performance when true recursion is being used. */
536 :
537 : /* First check that we haven't called match() too many times, or that we
538 : haven't exceeded the recursive call limit. */
539 :
540 51978 : if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
541 51978 : if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
542 :
543 51978 : original_ims = ims; /* Save for resetting on ')' */
544 :
545 : #ifdef SUPPORT_UTF8
546 51978 : utf8 = md->utf8; /* Local copy of the flag */
547 : #else
548 : utf8 = FALSE;
549 : #endif
550 :
551 : /* At the start of a group with an unlimited repeat that may match an empty
552 : string, the match_cbegroup flag is set. When this is the case, add the current
553 : subject pointer to the chain of such remembered pointers, to be checked when we
554 : hit the closing ket, in order to break infinite loops that match no characters.
555 : When match() is called in other circumstances, don't add to the chain. If this
556 : is a tail recursion, use a block from the workspace, as the one on the stack is
557 : already used. */
558 :
559 51978 : if ((flags & match_cbegroup) != 0)
560 : {
561 : eptrblock *p;
562 0 : if ((flags & match_tail_recursed) != 0)
563 : {
564 0 : if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
565 0 : p = md->eptrchain + md->eptrn++;
566 : }
567 0 : else p = &newptrb;
568 0 : p->epb_saved_eptr = eptr;
569 0 : p->epb_prev = eptrb;
570 0 : eptrb = p;
571 : }
572 :
573 : /* Now start processing the opcodes. */
574 :
575 : for (;;)
576 : {
577 201762 : minimize = possessive = FALSE;
578 201762 : op = *ecode;
579 :
580 : /* For partial matching, remember if we ever hit the end of the subject after
581 : matching at least one subject character. */
582 :
583 201762 : if (md->partial &&
584 : eptr >= md->end_subject &&
585 : eptr > md->start_match)
586 0 : md->hitend = TRUE;
587 :
588 201762 : switch(op)
589 : {
590 : /* Handle a capturing bracket. If there is space in the offset vector, save
591 : the current subject position in the working slot at the top of the vector.
592 : We mustn't change the current values of the data slot, because they may be
593 : set from a previous iteration of this group, and be referred to by a
594 : reference inside the group.
595 :
596 : If the bracket fails to match, we need to restore this value and also the
597 : values of the final offsets, in case they were set by a previous iteration
598 : of the same bracket.
599 :
600 : If there isn't enough space in the offset vector, treat this as if it were
601 : a non-capturing bracket. Don't worry about setting the flag for the error
602 : case here; that is handled in the code for KET. */
603 :
604 : case OP_CBRA:
605 : case OP_SCBRA:
606 512 : number = GET2(ecode, 1+LINK_SIZE);
607 512 : offset = number << 1;
608 :
609 : #ifdef DEBUG
610 : printf("start bracket %d\n", number);
611 : printf("subject=");
612 : pchars(eptr, 16, TRUE, md);
613 : printf("\n");
614 : #endif
615 :
616 512 : if (offset < md->offset_max)
617 : {
618 512 : save_offset1 = md->offset_vector[offset];
619 512 : save_offset2 = md->offset_vector[offset+1];
620 512 : save_offset3 = md->offset_vector[md->offset_end - number];
621 512 : save_capture_last = md->capture_last;
622 :
623 : DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
624 512 : md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
625 :
626 512 : flags = (op == OP_SCBRA)? match_cbegroup : 0;
627 : do
628 : {
629 512 : RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
630 : ims, eptrb, flags);
631 512 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632 146 : md->capture_last = save_capture_last;
633 146 : ecode += GET(ecode, 1);
634 : }
635 146 : while (*ecode == OP_ALT);
636 :
637 : DPRINTF(("bracket %d failed\n", number));
638 :
639 146 : md->offset_vector[offset] = save_offset1;
640 146 : md->offset_vector[offset+1] = save_offset2;
641 146 : md->offset_vector[md->offset_end - number] = save_offset3;
642 :
643 146 : RRETURN(MATCH_NOMATCH);
644 : }
645 :
646 : /* Insufficient room for saving captured contents. Treat as a non-capturing
647 : bracket. */
648 :
649 : DPRINTF(("insufficient capture room: treat as non-capturing\n"));
650 :
651 : /* Non-capturing bracket. Loop for all the alternatives. When we get to the
652 : final alternative within the brackets, we would return the result of a
653 : recursive call to match() whatever happened. We can reduce stack usage by
654 : turning this into a tail recursion. */
655 :
656 : case OP_BRA:
657 : case OP_SBRA:
658 : DPRINTF(("start non-capturing bracket\n"));
659 17278 : flags = (op >= OP_SBRA)? match_cbegroup : 0;
660 : for (;;)
661 : {
662 17278 : if (ecode[GET(ecode, 1)] != OP_ALT)
663 : {
664 17278 : ecode += _pcre_OP_lengths[*ecode];
665 17278 : flags |= match_tail_recursed;
666 : DPRINTF(("bracket 0 tail recursion\n"));
667 17278 : goto TAIL_RECURSE;
668 : }
669 :
670 : /* For non-final alternatives, continue the loop for a NOMATCH result;
671 : otherwise return. */
672 :
673 0 : RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
674 : eptrb, flags);
675 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 0 : ecode += GET(ecode, 1);
677 0 : }
678 : /* Control never reaches here. */
679 :
680 : /* Conditional group: compilation checked that there are no more than
681 : two branches. If the condition is false, skipping the first branch takes us
682 : past the end if there is only one branch, but that's OK because that is
683 : exactly what going to the ket would do. As there is only one branch to be
684 : obeyed, we can use tail recursion to avoid using another stack frame. */
685 :
686 : case OP_COND:
687 : case OP_SCOND:
688 0 : if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
689 : {
690 0 : offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
691 0 : condition = md->recursive != NULL &&
692 : (offset == RREF_ANY || offset == md->recursive->group_num);
693 0 : ecode += condition? 3 : GET(ecode, 1);
694 : }
695 :
696 0 : else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
697 : {
698 0 : offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
699 0 : condition = offset < offset_top && md->offset_vector[offset] >= 0;
700 0 : ecode += condition? 3 : GET(ecode, 1);
701 : }
702 :
703 0 : else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
704 : {
705 0 : condition = FALSE;
706 0 : ecode += GET(ecode, 1);
707 : }
708 :
709 : /* The condition is an assertion. Call match() to evaluate it - setting
710 : the final argument match_condassert causes it to stop at the end of an
711 : assertion. */
712 :
713 : else
714 : {
715 0 : RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
716 : match_condassert);
717 0 : if (rrc == MATCH_MATCH)
718 : {
719 0 : condition = TRUE;
720 0 : ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
721 0 : while (*ecode == OP_ALT) ecode += GET(ecode, 1);
722 : }
723 0 : else if (rrc != MATCH_NOMATCH)
724 : {
725 0 : RRETURN(rrc); /* Need braces because of following else */
726 : }
727 : else
728 : {
729 0 : condition = FALSE;
730 0 : ecode += GET(ecode, 1);
731 : }
732 : }
733 :
734 : /* We are now at the branch that is to be obeyed. As there is only one,
735 : we can use tail recursion to avoid using another stack frame. If the second
736 : alternative doesn't exist, we can just plough on. */
737 :
738 0 : if (condition || *ecode == OP_ALT)
739 : {
740 0 : ecode += 1 + LINK_SIZE;
741 0 : flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
742 0 : goto TAIL_RECURSE;
743 : }
744 : else
745 : {
746 0 : ecode += 1 + LINK_SIZE;
747 : }
748 0 : break;
749 :
750 :
751 : /* End of the pattern. If we are in a top-level recursion, we should
752 : restore the offsets appropriately and continue from after the call. */
753 :
754 : case OP_END:
755 5025 : if (md->recursive != NULL && md->recursive->group_num == 0)
756 : {
757 0 : recursion_info *rec = md->recursive;
758 : DPRINTF(("End of pattern in a (?0) recursion\n"));
759 0 : md->recursive = rec->prevrec;
760 0 : memmove(md->offset_vector, rec->offset_save,
761 : rec->saved_max * sizeof(int));
762 0 : md->start_match = rec->save_start;
763 0 : ims = original_ims;
764 0 : ecode = rec->after_call;
765 0 : break;
766 : }
767 :
768 : /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
769 : string - backtracking will then try other alternatives, if any. */
770 :
771 5025 : if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
772 5025 : md->end_match_ptr = eptr; /* Record where we ended */
773 5025 : md->end_offset_top = offset_top; /* and how many extracts were taken */
774 5025 : RRETURN(MATCH_MATCH);
775 :
776 : /* Change option settings */
777 :
778 : case OP_OPT:
779 0 : ims = ecode[1];
780 0 : ecode += 2;
781 : DPRINTF(("ims set to %02lx\n", ims));
782 0 : break;
783 :
784 : /* Assertion brackets. Check the alternative branches in turn - the
785 : matching won't pass the KET for an assertion. If any one branch matches,
786 : the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
787 : start of each branch to move the current point backwards, so the code at
788 : this level is identical to the lookahead case. */
789 :
790 : case OP_ASSERT:
791 : case OP_ASSERTBACK:
792 : do
793 : {
794 0 : RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
795 0 : if (rrc == MATCH_MATCH) break;
796 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
797 0 : ecode += GET(ecode, 1);
798 : }
799 0 : while (*ecode == OP_ALT);
800 0 : if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
801 :
802 : /* If checking an assertion for a condition, return MATCH_MATCH. */
803 :
804 0 : if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
805 :
806 : /* Continue from after the assertion, updating the offsets high water
807 : mark, since extracts may have been taken during the assertion. */
808 :
809 0 : do ecode += GET(ecode,1); while (*ecode == OP_ALT);
810 0 : ecode += 1 + LINK_SIZE;
811 0 : offset_top = md->end_offset_top;
812 0 : continue;
813 :
814 : /* Negative assertion: all branches must fail to match */
815 :
816 : case OP_ASSERT_NOT:
817 : case OP_ASSERTBACK_NOT:
818 : do
819 : {
820 0 : RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
821 0 : if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
822 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
823 0 : ecode += GET(ecode,1);
824 : }
825 0 : while (*ecode == OP_ALT);
826 :
827 0 : if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
828 :
829 0 : ecode += 1 + LINK_SIZE;
830 0 : continue;
831 :
832 : /* Move the subject pointer back. This occurs only at the start of
833 : each branch of a lookbehind assertion. If we are too close to the start to
834 : move back, this match function fails. When working with UTF-8 we move
835 : back a number of characters, not bytes. */
836 :
837 : case OP_REVERSE:
838 : #ifdef SUPPORT_UTF8
839 0 : if (utf8)
840 : {
841 0 : i = GET(ecode, 1);
842 0 : while (i-- > 0)
843 : {
844 0 : eptr--;
845 0 : if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
846 0 : BACKCHAR(eptr)
847 : }
848 : }
849 : else
850 : #endif
851 :
852 : /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
853 :
854 : {
855 0 : eptr -= GET(ecode, 1);
856 0 : if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
857 : }
858 :
859 : /* Skip to next op code */
860 :
861 0 : ecode += 1 + LINK_SIZE;
862 0 : break;
863 :
864 : /* The callout item calls an external function, if one is provided, passing
865 : details of the match so far. This is mainly for debugging, though the
866 : function is able to force a failure. */
867 :
868 : case OP_CALLOUT:
869 0 : if (pcre_callout != NULL)
870 : {
871 : pcre_callout_block cb;
872 0 : cb.version = 1; /* Version 1 of the callout block */
873 0 : cb.callout_number = ecode[1];
874 0 : cb.offset_vector = md->offset_vector;
875 0 : cb.subject = (PCRE_SPTR)md->start_subject;
876 0 : cb.subject_length = md->end_subject - md->start_subject;
877 0 : cb.start_match = md->start_match - md->start_subject;
878 0 : cb.current_position = eptr - md->start_subject;
879 0 : cb.pattern_position = GET(ecode, 2);
880 0 : cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
881 0 : cb.capture_top = offset_top/2;
882 0 : cb.capture_last = md->capture_last;
883 0 : cb.callout_data = md->callout_data;
884 0 : if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
885 0 : if (rrc < 0) RRETURN(rrc);
886 : }
887 0 : ecode += 2 + 2*LINK_SIZE;
888 0 : break;
889 :
890 : /* Recursion either matches the current regex, or some subexpression. The
891 : offset data is the offset to the starting bracket from the start of the
892 : whole pattern. (This is so that it works from duplicated subpatterns.)
893 :
894 : If there are any capturing brackets started but not finished, we have to
895 : save their starting points and reinstate them after the recursion. However,
896 : we don't know how many such there are (offset_top records the completed
897 : total) so we just have to save all the potential data. There may be up to
898 : 65535 such values, which is too large to put on the stack, but using malloc
899 : for small numbers seems expensive. As a compromise, the stack is used when
900 : there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
901 : is used. If the malloc fails, the recursion is not attempted with partial
902 : data: the code below gives up at once and returns PCRE_ERROR_NOMEMORY via
903 : RRETURN.
904 :
905 : There are also other values that have to be saved. We use a chained
906 : sequence of blocks that actually live on the stack. Thanks to Robin Houston
907 : for the original version of this logic. */
908 :
909 : case OP_RECURSE:
910 : {
911 0 : callpat = md->start_code + GET(ecode, 1);
912 0 : new_recursive.group_num = (callpat == md->start_code)? 0 :
913 : GET2(callpat, 1 + LINK_SIZE);
914 :
915 : /* Add to "recursing stack" */
916 :
917 0 : new_recursive.prevrec = md->recursive;
918 0 : md->recursive = &new_recursive;
919 :
920 : /* Find where to continue from afterwards */
921 :
922 0 : ecode += 1 + LINK_SIZE;
923 0 : new_recursive.after_call = ecode;
924 :
925 : /* Now save the offset data. */
926 :
927 0 : new_recursive.saved_max = md->offset_end;
928 0 : if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
929 0 : new_recursive.offset_save = stacksave;
930 : else
931 : {
932 0 : new_recursive.offset_save =
933 : (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
934 0 : if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
935 : }
936 :
937 0 : memcpy(new_recursive.offset_save, md->offset_vector,
938 : new_recursive.saved_max * sizeof(int));
939 0 : new_recursive.save_start = md->start_match;
940 0 : md->start_match = eptr;
941 :
942 : /* OK, now we can do the recursion. For each top-level alternative we
943 : restore the offset and recursion data. */
944 :
945 : DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
946 0 : flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
947 : do
948 : {
949 0 : RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
950 : md, ims, eptrb, flags);
951 0 : if (rrc == MATCH_MATCH)
952 : {
953 : DPRINTF(("Recursion matched\n"));
954 0 : md->recursive = new_recursive.prevrec;
955 0 : if (new_recursive.offset_save != stacksave)
956 0 : (pcre_free)(new_recursive.offset_save);
957 0 : RRETURN(MATCH_MATCH);
958 : }
959 0 : else if (rrc != MATCH_NOMATCH)
960 : {
961 : DPRINTF(("Recursion gave error %d\n", rrc));
962 0 : RRETURN(rrc);
963 : }
964 :
965 0 : md->recursive = &new_recursive;
966 0 : memcpy(md->offset_vector, new_recursive.offset_save,
967 : new_recursive.saved_max * sizeof(int));
968 0 : callpat += GET(callpat, 1);
969 : }
970 0 : while (*callpat == OP_ALT);
971 :
972 : DPRINTF(("Recursion didn't match\n"));
973 0 : md->recursive = new_recursive.prevrec;
974 0 : if (new_recursive.offset_save != stacksave)
975 0 : (pcre_free)(new_recursive.offset_save);
976 0 : RRETURN(MATCH_NOMATCH);
977 : }
978 : /* Control never reaches here */
979 :
980 : /* "Once" brackets are like assertion brackets except that after a match,
981 : the point in the subject string is not moved back. Thus there can never be
982 : a move back into the brackets. Friedl calls these "atomic" subpatterns.
983 : Check the alternative branches in turn - the matching won't pass the KET
984 : for this kind of subpattern. If any one branch matches, we carry on as at
985 : the end of a normal bracket, leaving the subject pointer. */
986 :
987 : case OP_ONCE:
988 0 : prev = ecode;
989 0 : saved_eptr = eptr;
990 :
991 : do
992 : {
993 0 : RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
994 : eptrb, 0);
995 0 : if (rrc == MATCH_MATCH) break;
996 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
997 0 : ecode += GET(ecode,1);
998 : }
999 0 : while (*ecode == OP_ALT);
1000 :
1001 : /* If we hit the end of the group (which could be repeated), fail */
1002 :
1003 0 : if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1004 :
1005 : /* Continue as from after the atomic group, updating the offsets high water
1006 : mark, since extracts may have been taken. */
1007 :
1008 0 : do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1009 :
1010 0 : offset_top = md->end_offset_top;
1011 0 : eptr = md->end_match_ptr;
1012 :
1013 : /* For a non-repeating ket, just continue at this level. This also
1014 : happens for a repeating ket if no characters were matched in the group.
1015 : This is the forcible breaking of infinite loops as implemented in Perl
1016 : 5.005. If there is an options reset, it will get obeyed in the normal
1017 : course of events. */
1018 :
1019 0 : if (*ecode == OP_KET || eptr == saved_eptr)
1020 : {
1021 0 : ecode += 1+LINK_SIZE;
1022 0 : break;
1023 : }
1024 :
1025 : /* The repeating kets try the rest of the pattern or restart from the
1026 : preceding bracket, in the appropriate order. The second "call" of match()
1027 : uses tail recursion, to avoid using another stack frame. We need to reset
1028 : any options that changed within the bracket before re-running it, so
1029 : check the next opcode. */
1030 :
1031 0 : if (ecode[1+LINK_SIZE] == OP_OPT)
1032 : {
1033 0 : ims = (ims & ~PCRE_IMS) | ecode[4];
1034 : DPRINTF(("ims set to %02lx at group repeat\n", ims));
1035 : }
1036 :
1037 0 : if (*ecode == OP_KETRMIN)
1038 : {
1039 0 : RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1040 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1041 0 : ecode = prev;
1042 0 : flags = match_tail_recursed;
1043 0 : goto TAIL_RECURSE;
1044 : }
1045 : else /* OP_KETRMAX */
1046 : {
1047 0 : RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1048 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1049 0 : ecode += 1 + LINK_SIZE;
1050 0 : flags = match_tail_recursed;
1051 0 : goto TAIL_RECURSE;
1052 : }
1053 : /* Control never gets here */
1054 :
1055 : /* An alternation is the end of a branch; scan along to find the end of the
1056 : bracketed group and go to there. */
1057 :
1058 : case OP_ALT:
1059 0 : do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1060 0 : break;
1061 :
1062 : /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1063 : that it may occur zero times. It may repeat infinitely, or not at all -
1064 : i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1065 : repeat limits are compiled as a number of copies, with the optional ones
1066 : preceded by BRAZERO or BRAMINZERO. */
1067 :
1068 : case OP_BRAZERO:
1069 : {
1070 132 : next = ecode+1;
1071 132 : RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1072 132 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1073 132 : do next += GET(next,1); while (*next == OP_ALT);
1074 132 : ecode = next + 1 + LINK_SIZE;
1075 : }
1076 132 : break;
1077 :
1078 : case OP_BRAMINZERO:
1079 : {
1080 0 : next = ecode+1;
1081 0 : do next += GET(next, 1); while (*next == OP_ALT);
1082 0 : RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1083 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084 0 : ecode++;
1085 : }
1086 0 : break;
1087 :
1088 : /* End of a group, repeated or non-repeating. */
1089 :
1090 : case OP_KET:
1091 : case OP_KETRMIN:
1092 : case OP_KETRMAX:
1093 5391 : prev = ecode - GET(ecode, 1);
1094 :
1095 : /* If this was a group that remembered the subject start, in order to break
1096 : infinite repeats of empty string matches, retrieve the subject start from
1097 : the chain. Otherwise, set it to NULL. */
1098 :
1099 5391 : if (*prev >= OP_SBRA)
1100 : {
1101 0 : saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1102 0 : eptrb = eptrb->epb_prev; /* Backup to previous group */
1103 : }
1104 5391 : else saved_eptr = NULL;
1105 :
1106 : /* If we are at the end of an assertion group, stop matching and return
1107 : MATCH_MATCH, but record the current high water mark for use by positive
1108 : assertions. Do this also for the "once" (atomic) groups. */
1109 :
1110 5391 : if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1111 : *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1112 : *prev == OP_ONCE)
1113 : {
1114 0 : md->end_match_ptr = eptr; /* For ONCE */
1115 0 : md->end_offset_top = offset_top;
1116 0 : RRETURN(MATCH_MATCH);
1117 : }
1118 :
1119 : /* For capturing groups we have to check the group number back at the start
1120 : and if necessary complete handling an extraction by setting the offsets and
1121 : bumping the high water mark. Note that whole-pattern recursion is coded as
1122 : a recurse into group 0, so it won't be picked up here. Instead, we catch it
1123 : when the OP_END is reached. Other recursion is handled here. */
1124 :
1125 5391 : if (*prev == OP_CBRA || *prev == OP_SCBRA)
1126 : {
1127 366 : number = GET2(prev, 1+LINK_SIZE);
1128 366 : offset = number << 1;
1129 :
1130 : #ifdef DEBUG
1131 : printf("end bracket %d", number);
1132 : printf("\n");
1133 : #endif
1134 :
1135 366 : md->capture_last = number;
1136 366 : if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1137 : {
1138 366 : md->offset_vector[offset] =
1139 : md->offset_vector[md->offset_end - number];
1140 366 : md->offset_vector[offset+1] = eptr - md->start_subject;
1141 366 : if (offset_top <= offset) offset_top = offset + 2;
1142 : }
1143 :
1144 : /* Handle a recursively called group. Restore the offsets
1145 : appropriately and continue from after the call. */
1146 :
1147 366 : if (md->recursive != NULL && md->recursive->group_num == number)
1148 : {
1149 0 : recursion_info *rec = md->recursive;
1150 : DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1151 0 : md->recursive = rec->prevrec;
1152 0 : md->start_match = rec->save_start;
1153 0 : memcpy(md->offset_vector, rec->offset_save,
1154 : rec->saved_max * sizeof(int));
1155 0 : ecode = rec->after_call;
1156 0 : ims = original_ims;
1157 0 : break;
1158 : }
1159 : }
1160 :
1161 : /* For both capturing and non-capturing groups, reset the value of the ims
1162 : flags, in case they got changed during the group. */
1163 :
1164 5391 : ims = original_ims;
1165 : DPRINTF(("ims reset to %02lx\n", ims));
1166 :
1167 : /* For a non-repeating ket, just continue at this level. This also
1168 : happens for a repeating ket if no characters were matched in the group.
1169 : This is the forcible breaking of infinite loops as implemented in Perl
1170 : 5.005. If there is an options reset, it will get obeyed in the normal
1171 : course of events. */
1172 :
1173 5391 : if (*ecode == OP_KET || eptr == saved_eptr)
1174 : {
1175 5391 : ecode += 1 + LINK_SIZE;
1176 5391 : break;
1177 : }
1178 :
1179 : /* The repeating kets try the rest of the pattern or restart from the
1180 : preceding bracket, in the appropriate order. In the second case, we can use
1181 : tail recursion to avoid using another stack frame. */
1182 :
1183 0 : flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1184 :
1185 0 : if (*ecode == OP_KETRMIN)
1186 : {
1187 0 : RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1188 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1189 0 : ecode = prev;
1190 0 : flags |= match_tail_recursed;
1191 0 : goto TAIL_RECURSE;
1192 : }
1193 : else /* OP_KETRMAX */
1194 : {
1195 0 : RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1196 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 0 : ecode += 1 + LINK_SIZE;
1198 0 : flags = match_tail_recursed;
1199 0 : goto TAIL_RECURSE;
1200 : }
1201 : /* Control never gets here */
1202 :
1203 : /* Start of subject unless notbol, or after internal newline if multiline */
1204 :
1205 : case OP_CIRC:
1206 16968 : if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1207 16968 : if ((ims & PCRE_MULTILINE) != 0)
1208 : {
1209 0 : if (eptr != md->start_subject &&
1210 : (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1211 0 : RRETURN(MATCH_NOMATCH);
1212 0 : ecode++;
1213 0 : break;
1214 : }
1215 : /* ... else fall through */
1216 :
1217 : /* Start of subject assertion */
1218 :
1219 : case OP_SOD:
1220 16968 : if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1221 16968 : ecode++;
1222 16968 : break;
1223 :
1224 : /* Start of match assertion */
1225 :
1226 : case OP_SOM:
1227 0 : if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1228 0 : ecode++;
1229 0 : break;
1230 :
1231 : /* Assert before internal newline if multiline, or before a terminating
1232 : newline unless endonly is set, else end of subject unless noteol is set. */
1233 :
1234 : case OP_DOLL:
1235 7870 : if ((ims & PCRE_MULTILINE) != 0)
1236 : {
1237 0 : if (eptr < md->end_subject)
1238 0 : { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1239 : else
1240 0 : { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1241 0 : ecode++;
1242 0 : break;
1243 : }
1244 : else
1245 : {
1246 7870 : if (md->noteol) RRETURN(MATCH_NOMATCH);
1247 7870 : if (!md->endonly)
1248 : {
1249 7870 : if (eptr != md->end_subject &&
1250 : (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1251 3452 : RRETURN(MATCH_NOMATCH);
1252 4418 : ecode++;
1253 4418 : break;
1254 : }
1255 : }
1256 : /* ... else fall through for endonly */
1257 :
1258 : /* End of subject assertion (\z) */
1259 :
1260 : case OP_EOD:
1261 0 : if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1262 0 : ecode++;
1263 0 : break;
1264 :
1265 : /* End of subject or ending \n assertion (\Z) */
1266 :
1267 : case OP_EODN:
1268 0 : if (eptr != md->end_subject &&
1269 : (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1270 0 : RRETURN(MATCH_NOMATCH);
1271 0 : ecode++;
1272 0 : break;
1273 :
1274 : /* Word boundary assertions */
1275 :
1276 : case OP_NOT_WORD_BOUNDARY:
1277 : case OP_WORD_BOUNDARY:
1278 : {
1279 :
1280 : /* Find out if the previous and current characters are "word" characters.
1281 : It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1282 : be "non-word" characters. */
1283 :
1284 : #ifdef SUPPORT_UTF8
1285 0 : if (utf8)
1286 : {
1287 0 : if (eptr == md->start_subject) prev_is_word = FALSE; else
1288 : {
1289 0 : const uschar *lastptr = eptr - 1;
1290 0 : while((*lastptr & 0xc0) == 0x80) lastptr--;
1291 0 : GETCHAR(c, lastptr);
1292 0 : prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1293 : }
1294 0 : if (eptr >= md->end_subject) cur_is_word = FALSE; else
1295 : {
1296 0 : GETCHAR(c, eptr);
1297 0 : cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1298 : }
1299 : }
1300 : else
1301 : #endif
1302 :
1303 : /* More streamlined when not in UTF-8 mode */
1304 :
1305 : {
1306 0 : prev_is_word = (eptr != md->start_subject) &&
1307 : ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1308 0 : cur_is_word = (eptr < md->end_subject) &&
1309 : ((md->ctypes[*eptr] & ctype_word) != 0);
1310 : }
1311 :
1312 : /* Now see if the situation is what we want */
1313 :
1314 0 : if ((*ecode++ == OP_WORD_BOUNDARY)?
1315 : cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1316 0 : RRETURN(MATCH_NOMATCH);
1317 : }
1318 0 : break;
1319 :
1320 : /* Match a single character type; inline for speed */
1321 :
1322 : case OP_ANY:
1323 0 : if ((ims & PCRE_DOTALL) == 0)
1324 : {
1325 0 : if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1326 : }
1327 0 : if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1328 0 : if (utf8)
1329 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1330 0 : ecode++;
1331 0 : break;
1332 :
1333 : /* Match a single byte, even in UTF-8 mode. This opcode really does match
1334 : any byte, even newline, independent of the setting of PCRE_DOTALL. */
1335 :
1336 : case OP_ANYBYTE:
1337 0 : if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1338 0 : ecode++;
1339 0 : break;
1340 :
1341 : case OP_NOT_DIGIT:
1342 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1343 0 : GETCHARINCTEST(c, eptr);
1344 0 : if (
1345 : #ifdef SUPPORT_UTF8
1346 : c < 256 &&
1347 : #endif
1348 : (md->ctypes[c] & ctype_digit) != 0
1349 : )
1350 0 : RRETURN(MATCH_NOMATCH);
1351 0 : ecode++;
1352 0 : break;
1353 :
1354 : case OP_DIGIT:
1355 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1356 0 : GETCHARINCTEST(c, eptr);
1357 0 : if (
1358 : #ifdef SUPPORT_UTF8
1359 : c >= 256 ||
1360 : #endif
1361 : (md->ctypes[c] & ctype_digit) == 0
1362 : )
1363 0 : RRETURN(MATCH_NOMATCH);
1364 0 : ecode++;
1365 0 : break;
1366 :
1367 : case OP_NOT_WHITESPACE:
1368 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1369 0 : GETCHARINCTEST(c, eptr);
1370 0 : if (
1371 : #ifdef SUPPORT_UTF8
1372 : c < 256 &&
1373 : #endif
1374 : (md->ctypes[c] & ctype_space) != 0
1375 : )
1376 0 : RRETURN(MATCH_NOMATCH);
1377 0 : ecode++;
1378 0 : break;
1379 :
1380 : case OP_WHITESPACE:
1381 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382 0 : GETCHARINCTEST(c, eptr);
1383 0 : if (
1384 : #ifdef SUPPORT_UTF8
1385 : c >= 256 ||
1386 : #endif
1387 : (md->ctypes[c] & ctype_space) == 0
1388 : )
1389 0 : RRETURN(MATCH_NOMATCH);
1390 0 : ecode++;
1391 0 : break;
1392 :
1393 : case OP_NOT_WORDCHAR:
1394 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1395 0 : GETCHARINCTEST(c, eptr);
1396 0 : if (
1397 : #ifdef SUPPORT_UTF8
1398 : c < 256 &&
1399 : #endif
1400 : (md->ctypes[c] & ctype_word) != 0
1401 : )
1402 0 : RRETURN(MATCH_NOMATCH);
1403 0 : ecode++;
1404 0 : break;
1405 :
1406 : case OP_WORDCHAR:
1407 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1408 0 : GETCHARINCTEST(c, eptr);
1409 0 : if (
1410 : #ifdef SUPPORT_UTF8
1411 : c >= 256 ||
1412 : #endif
1413 : (md->ctypes[c] & ctype_word) == 0
1414 : )
1415 0 : RRETURN(MATCH_NOMATCH);
1416 0 : ecode++;
1417 0 : break;
1418 :
1419 : case OP_ANYNL:
1420 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1421 0 : GETCHARINCTEST(c, eptr);
1422 0 : switch(c)
1423 : {
1424 0 : default: RRETURN(MATCH_NOMATCH);
1425 : case 0x000d:
1426 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1427 : break;
1428 : case 0x000a:
1429 : case 0x000b:
1430 : case 0x000c:
1431 : case 0x0085:
1432 : case 0x2028:
1433 : case 0x2029:
1434 : break;
1435 : }
1436 0 : ecode++;
1437 0 : break;
1438 :
1439 : #ifdef SUPPORT_UCP
1440 : /* Check the next character by Unicode property. We will get here only
1441 : if the support is in the binary; otherwise a compile-time error occurs. */
1442 :
1443 : case OP_PROP:
1444 : case OP_NOTPROP:
1445 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446 0 : GETCHARINCTEST(c, eptr);
1447 : {
1448 : int chartype, script;
1449 0 : int category = _pcre_ucp_findprop(c, &chartype, &script);
1450 :
1451 0 : switch(ecode[1])
1452 : {
1453 : case PT_ANY:
1454 0 : if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1455 0 : break;
1456 :
1457 : case PT_LAMP:
1458 0 : if ((chartype == ucp_Lu ||
1459 : chartype == ucp_Ll ||
1460 : chartype == ucp_Lt) == (op == OP_NOTPROP))
1461 0 : RRETURN(MATCH_NOMATCH);
1462 0 : break;
1463 :
1464 : case PT_GC:
1465 0 : if ((ecode[2] != category) == (op == OP_PROP))
1466 0 : RRETURN(MATCH_NOMATCH);
1467 0 : break;
1468 :
1469 : case PT_PC:
1470 0 : if ((ecode[2] != chartype) == (op == OP_PROP))
1471 0 : RRETURN(MATCH_NOMATCH);
1472 0 : break;
1473 :
1474 : case PT_SC:
1475 0 : if ((ecode[2] != script) == (op == OP_PROP))
1476 0 : RRETURN(MATCH_NOMATCH);
1477 0 : break;
1478 :
1479 : default:
1480 0 : RRETURN(PCRE_ERROR_INTERNAL);
1481 : }
1482 :
1483 0 : ecode += 3;
1484 : }
1485 0 : break;
1486 :
1487 : /* Match an extended Unicode sequence. We will get here only if the support
1488 : is in the binary; otherwise a compile-time error occurs. */
1489 :
1490 : case OP_EXTUNI:
1491 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1492 0 : GETCHARINCTEST(c, eptr);
1493 : {
1494 : int chartype, script;
1495 0 : int category = _pcre_ucp_findprop(c, &chartype, &script);
1496 0 : if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1497 0 : while (eptr < md->end_subject)
1498 : {
1499 0 : int len = 1;
1500 0 : if (!utf8) c = *eptr; else
1501 : {
1502 0 : GETCHARLEN(c, eptr, len);
1503 : }
1504 0 : category = _pcre_ucp_findprop(c, &chartype, &script);
1505 0 : if (category != ucp_M) break;
1506 0 : eptr += len;
1507 : }
1508 : }
1509 0 : ecode++;
1510 0 : break;
1511 : #endif
1512 :
1513 :
1514 : /* Match a back reference, possibly repeatedly. Look past the end of the
1515 : item to see if there is repeat information following. The code is similar
1516 : to that for character classes, but repeated for efficiency. Then obey
1517 : similar code to character type repeats - written out again for speed.
1518 : However, if the referenced string is the empty string, always treat
1519 : it as matched, any number of times (otherwise there could be infinite
1520 : loops). */
1521 :
1522 : case OP_REF:
1523 : {
1524 0 : offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1525 0 : ecode += 3; /* Advance past item */
1526 :
1527 : /* If the reference is unset, set the length to be longer than the amount
1528 : of subject left; this ensures that every attempt at a match fails. We
1529 : can't just fail here, because of the possibility of quantifiers with zero
1530 : minima. */
1531 :
1532 0 : length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1533 : md->end_subject - eptr + 1 :
1534 : md->offset_vector[offset+1] - md->offset_vector[offset];
1535 :
1536 : /* Set up for repetition, or handle the non-repeated case */
1537 :
1538 0 : switch (*ecode)
1539 : {
1540 : case OP_CRSTAR:
1541 : case OP_CRMINSTAR:
1542 : case OP_CRPLUS:
1543 : case OP_CRMINPLUS:
1544 : case OP_CRQUERY:
1545 : case OP_CRMINQUERY:
1546 0 : c = *ecode++ - OP_CRSTAR;
1547 0 : minimize = (c & 1) != 0;
1548 0 : min = rep_min[c]; /* Pick up values from tables; */
1549 0 : max = rep_max[c]; /* zero for max => infinity */
1550 0 : if (max == 0) max = INT_MAX;
1551 0 : break;
1552 :
1553 : case OP_CRRANGE:
1554 : case OP_CRMINRANGE:
1555 0 : minimize = (*ecode == OP_CRMINRANGE);
1556 0 : min = GET2(ecode, 1);
1557 0 : max = GET2(ecode, 3);
1558 0 : if (max == 0) max = INT_MAX;
1559 0 : ecode += 5;
1560 0 : break;
1561 :
1562 : default: /* No repeat follows */
1563 0 : if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1564 0 : eptr += length;
1565 0 : continue; /* With the main loop */
1566 : }
1567 :
1568 : /* If the length of the reference is zero, just continue with the
1569 : main loop. */
1570 :
1571 0 : if (length == 0) continue;
1572 :
1573 : /* First, ensure the minimum number of matches are present. We get back
1574 : the length of the reference string explicitly rather than passing the
1575 : address of eptr, so that eptr can be a register variable. */
1576 :
1577 0 : for (i = 1; i <= min; i++)
1578 : {
1579 0 : if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1580 0 : eptr += length;
1581 : }
1582 :
1583 : /* If min = max, continue at the same level without recursion.
1584 : They are not both allowed to be zero. */
1585 :
1586 0 : if (min == max) continue;
1587 :
1588 : /* If minimizing, keep trying and advancing the pointer */
1589 :
1590 0 : if (minimize)
1591 : {
1592 0 : for (fi = min;; fi++)
1593 : {
1594 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1595 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1596 0 : if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1597 0 : RRETURN(MATCH_NOMATCH);
1598 0 : eptr += length;
1599 0 : }
1600 : /* Control never gets here */
1601 : }
1602 :
1603 : /* If maximizing, find the longest string and work backwards */
1604 :
1605 : else
1606 : {
1607 0 : pp = eptr;
1608 0 : for (i = min; i < max; i++)
1609 : {
1610 0 : if (!match_ref(offset, eptr, length, md, ims)) break;
1611 0 : eptr += length;
1612 : }
1613 0 : while (eptr >= pp)
1614 : {
1615 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1616 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1617 0 : eptr -= length;
1618 : }
1619 0 : RRETURN(MATCH_NOMATCH);
1620 : }
1621 : }
1622 : /* Control never gets here */
1623 :
1624 :
1625 :
1626 : /* Match a bit-mapped character class, possibly repeatedly. This op code is
1627 : used when all the characters in the class have values in the range 0-255,
1628 : and either the matching is caseful, or the characters are in the range
1629 : 0-127 when UTF-8 processing is enabled. The only difference between
1630 : OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1631 : encountered.
1632 :
1633 : First, look past the end of the item to see if there is repeat information
1634 : following. Then obey similar code to character type repeats - written out
1635 : again for speed. */
1636 :
1637 : case OP_NCLASS:
1638 : case OP_CLASS:
1639 : {
1640 1206 : data = ecode + 1; /* Save for matching */
1641 1206 : ecode += 33; /* Advance past the item */
1642 :
1643 1206 : switch (*ecode)
1644 : {
1645 : case OP_CRSTAR:
1646 : case OP_CRMINSTAR:
1647 : case OP_CRPLUS:
1648 : case OP_CRMINPLUS:
1649 : case OP_CRQUERY:
1650 : case OP_CRMINQUERY:
1651 1206 : c = *ecode++ - OP_CRSTAR;
1652 1206 : minimize = (c & 1) != 0;
1653 1206 : min = rep_min[c]; /* Pick up values from tables; */
1654 1206 : max = rep_max[c]; /* zero for max => infinity */
1655 1206 : if (max == 0) max = INT_MAX;
1656 1206 : break;
1657 :
1658 : case OP_CRRANGE:
1659 : case OP_CRMINRANGE:
1660 0 : minimize = (*ecode == OP_CRMINRANGE);
1661 0 : min = GET2(ecode, 1);
1662 0 : max = GET2(ecode, 3);
1663 0 : if (max == 0) max = INT_MAX;
1664 0 : ecode += 5;
1665 0 : break;
1666 :
1667 : default: /* No repeat follows */
1668 0 : min = max = 1;
1669 : break;
1670 : }
1671 :
1672 : /* First, ensure the minimum number of matches are present. */
1673 :
1674 : #ifdef SUPPORT_UTF8
1675 : /* UTF-8 mode */
1676 1206 : if (utf8)
1677 : {
1678 0 : for (i = 1; i <= min; i++)
1679 : {
1680 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1681 0 : GETCHARINC(c, eptr);
1682 0 : if (c > 255)
1683 : {
1684 0 : if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1685 : }
1686 : else
1687 : {
1688 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1689 : }
1690 : }
1691 : }
1692 : else
1693 : #endif
1694 : /* Not UTF-8 mode */
1695 : {
1696 2043 : for (i = 1; i <= min; i++)
1697 : {
1698 968 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1699 967 : c = *eptr++;
1700 967 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1701 : }
1702 : }
1703 :
1704 : /* If max == min we can continue with the main loop without the
1705 : need to recurse. */
1706 :
1707 1075 : if (min == max) continue;
1708 :
1709 : /* If minimizing, keep testing the rest of the expression and advancing
1710 : the pointer while it matches the class. */
1711 :
1712 1075 : if (minimize)
1713 : {
1714 : #ifdef SUPPORT_UTF8
1715 : /* UTF-8 mode */
1716 0 : if (utf8)
1717 : {
1718 0 : for (fi = min;; fi++)
1719 : {
1720 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1721 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1722 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1723 0 : GETCHARINC(c, eptr);
1724 0 : if (c > 255)
1725 : {
1726 0 : if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1727 : }
1728 : else
1729 : {
1730 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1731 : }
1732 0 : }
1733 : }
1734 : else
1735 : #endif
1736 : /* Not UTF-8 mode */
1737 : {
1738 0 : for (fi = min;; fi++)
1739 : {
1740 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1741 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1742 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1743 0 : c = *eptr++;
1744 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1745 0 : }
1746 : }
1747 : /* Control never gets here */
1748 : }
1749 :
1750 : /* If maximizing, find the longest possible run, then work backwards. */
1751 :
1752 : else
1753 : {
1754 1075 : pp = eptr;
1755 :
1756 : #ifdef SUPPORT_UTF8
1757 : /* UTF-8 mode */
1758 1075 : if (utf8)
1759 : {
1760 0 : for (i = min; i < max; i++)
1761 : {
1762 0 : int len = 1;
1763 0 : if (eptr >= md->end_subject) break;
1764 0 : GETCHARLEN(c, eptr, len);
1765 0 : if (c > 255)
1766 : {
1767 0 : if (op == OP_CLASS) break;
1768 : }
1769 : else
1770 : {
1771 0 : if ((data[c/8] & (1 << (c&7))) == 0) break;
1772 : }
1773 0 : eptr += len;
1774 : }
1775 : for (;;)
1776 : {
1777 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1778 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1779 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
1780 0 : BACKCHAR(eptr);
1781 0 : }
1782 : }
1783 : else
1784 : #endif
1785 : /* Not UTF-8 mode */
1786 : {
1787 4106 : for (i = min; i < max; i++)
1788 : {
1789 4106 : if (eptr >= md->end_subject) break;
1790 4105 : c = *eptr;
1791 4105 : if ((data[c/8] & (1 << (c&7))) == 0) break;
1792 3031 : eptr++;
1793 : }
1794 2240 : while (eptr >= pp)
1795 : {
1796 1138 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1797 1138 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1798 90 : eptr--;
1799 : }
1800 : }
1801 :
1802 27 : RRETURN(MATCH_NOMATCH);
1803 : }
1804 : }
1805 : /* Control never gets here */
1806 :
1807 :
1808 : /* Match an extended character class. This opcode is encountered only
1809 : in UTF-8 mode, because that's the only time it is compiled. */
1810 :
1811 : #ifdef SUPPORT_UTF8
1812 : case OP_XCLASS:
1813 : {
1814 0 : data = ecode + 1 + LINK_SIZE; /* Save for matching */
1815 0 : ecode += GET(ecode, 1); /* Advance past the item */
1816 :
1817 0 : switch (*ecode)
1818 : {
1819 : case OP_CRSTAR:
1820 : case OP_CRMINSTAR:
1821 : case OP_CRPLUS:
1822 : case OP_CRMINPLUS:
1823 : case OP_CRQUERY:
1824 : case OP_CRMINQUERY:
1825 0 : c = *ecode++ - OP_CRSTAR;
1826 0 : minimize = (c & 1) != 0;
1827 0 : min = rep_min[c]; /* Pick up values from tables; */
1828 0 : max = rep_max[c]; /* zero for max => infinity */
1829 0 : if (max == 0) max = INT_MAX;
1830 0 : break;
1831 :
1832 : case OP_CRRANGE:
1833 : case OP_CRMINRANGE:
1834 0 : minimize = (*ecode == OP_CRMINRANGE);
1835 0 : min = GET2(ecode, 1);
1836 0 : max = GET2(ecode, 3);
1837 0 : if (max == 0) max = INT_MAX;
1838 0 : ecode += 5;
1839 0 : break;
1840 :
1841 : default: /* No repeat follows */
1842 0 : min = max = 1;
1843 : break;
1844 : }
1845 :
1846 : /* First, ensure the minimum number of matches are present. */
1847 :
1848 0 : for (i = 1; i <= min; i++)
1849 : {
1850 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1851 0 : GETCHARINC(c, eptr);
1852 0 : if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1853 : }
1854 :
1855 : /* If max == min we can continue with the main loop without the
1856 : need to recurse. */
1857 :
1858 0 : if (min == max) continue;
1859 :
1860 : /* If minimizing, keep testing the rest of the expression and advancing
1861 : the pointer while it matches the class. */
1862 :
1863 0 : if (minimize)
1864 : {
1865 0 : for (fi = min;; fi++)
1866 : {
1867 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1868 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1869 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1870 0 : GETCHARINC(c, eptr);
1871 0 : if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1872 0 : }
1873 : /* Control never gets here */
1874 : }
1875 :
1876 : /* If maximizing, find the longest possible run, then work backwards. */
1877 :
1878 : else
1879 : {
1880 0 : pp = eptr;
1881 0 : for (i = min; i < max; i++)
1882 : {
1883 0 : int len = 1;
1884 0 : if (eptr >= md->end_subject) break;
1885 0 : GETCHARLEN(c, eptr, len);
1886 0 : if (!_pcre_xclass(c, data)) break;
1887 0 : eptr += len;
1888 : }
1889 : for(;;)
1890 : {
1891 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1892 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1893 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
1894 0 : BACKCHAR(eptr)
1895 0 : }
1896 0 : RRETURN(MATCH_NOMATCH);
1897 : }
1898 :
1899 : /* Control never gets here */
1900 : }
1901 : #endif /* End of XCLASS */
1902 :
1903 : /* Match a single character, casefully */
1904 :
1905 : case OP_CHAR:
1906 : #ifdef SUPPORT_UTF8
1907 146718 : if (utf8)
1908 : {
1909 0 : length = 1;
1910 0 : ecode++;
1911 0 : GETCHARLEN(fc, ecode, length);
1912 0 : if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1913 0 : while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1914 : }
1915 : else
1916 : #endif
1917 :
1918 : /* Non-UTF-8 mode */
1919 : {
1920 146718 : if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1921 145946 : if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1922 122867 : ecode += 2;
1923 : }
1924 122867 : break;
1925 :
1926 : /* Match a single character, caselessly */
1927 :
1928 : case OP_CHARNC:
1929 : #ifdef SUPPORT_UTF8
1930 4 : if (utf8)
1931 : {
1932 0 : length = 1;
1933 0 : ecode++;
1934 0 : GETCHARLEN(fc, ecode, length);
1935 :
1936 0 : if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1937 :
1938 : /* If the pattern character's value is < 128, we have only one byte, and
1939 : can use the fast lookup table. */
1940 :
1941 0 : if (fc < 128)
1942 : {
1943 0 : if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1944 : }
1945 :
1946 : /* Otherwise we must pick up the subject character */
1947 :
1948 : else
1949 : {
1950 : unsigned int dc;
1951 0 : GETCHARINC(dc, eptr);
1952 0 : ecode += length;
1953 :
1954 : /* If we have Unicode property support, we can use it to test the other
1955 : case of the character, if there is one. */
1956 :
1957 0 : if (fc != dc)
1958 : {
1959 : #ifdef SUPPORT_UCP
1960 0 : if (dc != _pcre_ucp_othercase(fc))
1961 : #endif
1962 0 : RRETURN(MATCH_NOMATCH);
1963 : }
1964 : }
1965 : }
1966 : else
1967 : #endif /* SUPPORT_UTF8 */
1968 :
1969 : /* Non-UTF-8 mode */
1970 : {
1971 4 : if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1972 4 : if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1973 4 : ecode += 2;
1974 : }
1975 4 : break;
1976 :
1977 : /* Match a single character repeatedly. */
1978 :
1979 : case OP_EXACT:
1980 0 : min = max = GET2(ecode, 1);
1981 0 : ecode += 3;
1982 0 : goto REPEATCHAR;
1983 :
1984 : case OP_POSUPTO:
1985 0 : possessive = TRUE;
1986 : /* Fall through */
1987 :
1988 : case OP_UPTO:
1989 : case OP_MINUPTO:
1990 0 : min = 0;
1991 0 : max = GET2(ecode, 1);
1992 0 : minimize = *ecode == OP_MINUPTO;
1993 0 : ecode += 3;
1994 0 : goto REPEATCHAR;
1995 :
1996 : case OP_POSSTAR:
1997 3 : possessive = TRUE;
1998 3 : min = 0;
1999 3 : max = INT_MAX;
2000 3 : ecode++;
2001 3 : goto REPEATCHAR;
2002 :
2003 : case OP_POSPLUS:
2004 0 : possessive = TRUE;
2005 0 : min = 1;
2006 0 : max = INT_MAX;
2007 0 : ecode++;
2008 0 : goto REPEATCHAR;
2009 :
2010 : case OP_POSQUERY:
2011 0 : possessive = TRUE;
2012 0 : min = 0;
2013 0 : max = 1;
2014 0 : ecode++;
2015 0 : goto REPEATCHAR;
2016 :
2017 : case OP_STAR:
2018 : case OP_MINSTAR:
2019 : case OP_PLUS:
2020 : case OP_MINPLUS:
2021 : case OP_QUERY:
2022 : case OP_MINQUERY:
2023 236 : c = *ecode++ - OP_STAR;
2024 236 : minimize = (c & 1) != 0;
2025 236 : min = rep_min[c]; /* Pick up values from tables; */
2026 236 : max = rep_max[c]; /* zero for max => infinity */
2027 236 : if (max == 0) max = INT_MAX;
2028 :
2029 : /* Common code for all repeated single-character matches. We can give
2030 : up quickly if there are fewer than the minimum number of characters left in
2031 : the subject. */
2032 :
2033 239 : REPEATCHAR:
2034 : #ifdef SUPPORT_UTF8
2035 239 : if (utf8)
2036 : {
2037 0 : length = 1;
2038 0 : charptr = ecode;
2039 0 : GETCHARLEN(fc, ecode, length);
2040 0 : if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2041 0 : ecode += length;
2042 :
2043 : /* Handle multibyte character matching specially here. There is
2044 : support for caseless matching if UCP support is present. */
2045 :
2046 0 : if (length > 1)
2047 : {
2048 0 : int oclength = 0;
2049 : uschar occhars[8];
2050 :
2051 : #ifdef SUPPORT_UCP
2052 : unsigned int othercase;
2053 0 : if ((ims & PCRE_CASELESS) != 0 &&
2054 : (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2055 0 : oclength = _pcre_ord2utf8(othercase, occhars);
2056 : #endif /* SUPPORT_UCP */
2057 :
2058 0 : for (i = 1; i <= min; i++)
2059 : {
2060 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2061 : /* Need braces because of following else */
2062 0 : else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2063 : else
2064 : {
2065 0 : if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2066 0 : eptr += oclength;
2067 : }
2068 : }
2069 :
2070 0 : if (min == max) continue;
2071 :
2072 0 : if (minimize)
2073 : {
2074 0 : for (fi = min;; fi++)
2075 : {
2076 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2077 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2078 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2079 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2080 : /* Need braces because of following else */
2081 0 : else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2082 : else
2083 : {
2084 0 : if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2085 0 : eptr += oclength;
2086 : }
2087 0 : }
2088 : /* Control never gets here */
2089 : }
2090 :
2091 : else /* Maximize */
2092 : {
2093 0 : pp = eptr;
2094 0 : for (i = min; i < max; i++)
2095 : {
2096 0 : if (eptr > md->end_subject - length) break;
2097 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2098 0 : else if (oclength == 0) break;
2099 : else
2100 : {
2101 0 : if (memcmp(eptr, occhars, oclength) != 0) break;
2102 0 : eptr += oclength;
2103 : }
2104 : }
2105 :
2106 0 : if (possessive) continue;
2107 0 : while (eptr >= pp)
2108 : {
2109 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2110 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2111 0 : eptr -= length;
2112 : }
2113 0 : RRETURN(MATCH_NOMATCH);
2114 : }
2115 : /* Control never gets here */
2116 : }
2117 :
2118 : /* If the length of a UTF-8 character is 1, we fall through here, and
2119 : obey the code as for non-UTF-8 characters below, though in this case the
2120 : value of fc will always be < 128. */
2121 : }
2122 : else
2123 : #endif /* SUPPORT_UTF8 */
2124 :
2125 : /* When not in UTF-8 mode, load a single-byte character. */
2126 : {
2127 239 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2128 239 : fc = *ecode++;
2129 : }
2130 :
2131 : /* The value of fc at this point is always less than 256, though we may or
2132 : may not be in UTF-8 mode. The code is duplicated for the caseless and
2133 : caseful cases, for speed, since matching characters is likely to be quite
2134 : common. First, ensure the minimum number of matches are present. If min =
2135 : max, continue at the same level without recursing. Otherwise, if
2136 : minimizing, keep trying the rest of the expression and advancing one
2137 : matching character if failing, up to the maximum. Alternatively, if
2138 : maximizing, find the maximum number of characters and work backwards. */
2139 :
2140 : DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2141 : max, eptr));
2142 :
2143 239 : if ((ims & PCRE_CASELESS) != 0)
2144 : {
2145 0 : fc = md->lcc[fc];
2146 0 : for (i = 1; i <= min; i++)
2147 0 : if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2148 0 : if (min == max) continue;
2149 0 : if (minimize)
2150 : {
2151 0 : for (fi = min;; fi++)
2152 : {
2153 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2154 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2155 0 : if (fi >= max || eptr >= md->end_subject ||
2156 : fc != md->lcc[*eptr++])
2157 0 : RRETURN(MATCH_NOMATCH);
2158 0 : }
2159 : /* Control never gets here */
2160 : }
2161 : else /* Maximize */
2162 : {
2163 0 : pp = eptr;
2164 0 : for (i = min; i < max; i++)
2165 : {
2166 0 : if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2167 0 : eptr++;
2168 : }
2169 0 : if (possessive) continue;
2170 0 : while (eptr >= pp)
2171 : {
2172 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2173 0 : eptr--;
2174 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175 : }
2176 0 : RRETURN(MATCH_NOMATCH);
2177 : }
2178 : /* Control never gets here */
2179 : }
2180 :
2181 : /* Caseful comparisons (includes all multi-byte characters) */
2182 :
2183 : else
2184 : {
2185 239 : for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2186 239 : if (min == max) continue;
2187 239 : if (minimize)
2188 : {
2189 0 : for (fi = min;; fi++)
2190 : {
2191 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2192 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2193 0 : if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2194 0 : RRETURN(MATCH_NOMATCH);
2195 0 : }
2196 : /* Control never gets here */
2197 : }
2198 : else /* Maximize */
2199 : {
2200 239 : pp = eptr;
2201 354 : for (i = min; i < max; i++)
2202 : {
2203 244 : if (eptr >= md->end_subject || fc != *eptr) break;
2204 115 : eptr++;
2205 : }
2206 239 : if (possessive) continue;
2207 478 : while (eptr >= pp)
2208 : {
2209 238 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2210 238 : eptr--;
2211 238 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2212 : }
2213 4 : RRETURN(MATCH_NOMATCH);
2214 : }
2215 : }
2216 : /* Control never gets here */
2217 :
2218 : /* Match a negated single one-byte character. The character we are
2219 : checking can be multibyte. */
2220 :
2221 : case OP_NOT:
2222 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2223 0 : ecode++;
2224 0 : GETCHARINCTEST(c, eptr);
2225 0 : if ((ims & PCRE_CASELESS) != 0)
2226 : {
2227 : #ifdef SUPPORT_UTF8
2228 0 : if (c < 256)
2229 : #endif
2230 0 : c = md->lcc[c];
2231 0 : if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2232 : }
2233 : else
2234 : {
2235 0 : if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2236 : }
2237 0 : break;
2238 :
2239 : /* Match a negated single one-byte character repeatedly. This is almost a
2240 : repeat of the code for a repeated single character, but I haven't found a
2241 : nice way of commoning these up that doesn't require a test of the
2242 : positive/negative option for each character match. Maybe that wouldn't add
2243 : very much to the time taken, but character matching *is* what this is all
2244 : about... */
2245 :
2246 : case OP_NOTEXACT:
2247 0 : min = max = GET2(ecode, 1);
2248 0 : ecode += 3;
2249 0 : goto REPEATNOTCHAR;
2250 :
2251 : case OP_NOTUPTO:
2252 : case OP_NOTMINUPTO:
2253 0 : min = 0;
2254 0 : max = GET2(ecode, 1);
2255 0 : minimize = *ecode == OP_NOTMINUPTO;
2256 0 : ecode += 3;
2257 0 : goto REPEATNOTCHAR;
2258 :
2259 : case OP_NOTPOSSTAR:
2260 0 : possessive = TRUE;
2261 0 : min = 0;
2262 0 : max = INT_MAX;
2263 0 : ecode++;
2264 0 : goto REPEATNOTCHAR;
2265 :
2266 : case OP_NOTPOSPLUS:
2267 0 : possessive = TRUE;
2268 0 : min = 1;
2269 0 : max = INT_MAX;
2270 0 : ecode++;
2271 0 : goto REPEATNOTCHAR;
2272 :
2273 : case OP_NOTPOSQUERY:
2274 0 : possessive = TRUE;
2275 0 : min = 0;
2276 0 : max = 1;
2277 0 : ecode++;
2278 0 : goto REPEATNOTCHAR;
2279 :
2280 : case OP_NOTPOSUPTO:
2281 0 : possessive = TRUE;
2282 0 : min = 0;
2283 0 : max = GET2(ecode, 1);
2284 0 : ecode += 3;
2285 0 : goto REPEATNOTCHAR;
2286 :
2287 : case OP_NOTSTAR:
2288 : case OP_NOTMINSTAR:
2289 : case OP_NOTPLUS:
2290 : case OP_NOTMINPLUS:
2291 : case OP_NOTQUERY:
2292 : case OP_NOTMINQUERY:
2293 0 : c = *ecode++ - OP_NOTSTAR;
2294 0 : minimize = (c & 1) != 0;
2295 0 : min = rep_min[c]; /* Pick up values from tables; */
2296 0 : max = rep_max[c]; /* zero for max => infinity */
2297 0 : if (max == 0) max = INT_MAX;
2298 :
2299 : /* Common code for all repeated single-byte matches. We can give up quickly
2300 : if there are fewer than the minimum number of bytes left in the
2301 : subject. */
2302 :
2303 0 : REPEATNOTCHAR:
2304 0 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2305 0 : fc = *ecode++;
2306 :
2307 : /* The code is duplicated for the caseless and caseful cases, for speed,
2308 : since matching characters is likely to be quite common. First, ensure the
2309 : minimum number of matches are present. If min = max, continue at the same
2310 : level without recursing. Otherwise, if minimizing, keep trying the rest of
2311 : the expression and advancing one matching character if failing, up to the
2312 : maximum. Alternatively, if maximizing, find the maximum number of
2313 : characters and work backwards. */
2314 :
2315 : DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2316 : max, eptr));
2317 :
2318 0 : if ((ims & PCRE_CASELESS) != 0)
2319 : {
2320 0 : fc = md->lcc[fc];
2321 :
2322 : #ifdef SUPPORT_UTF8
2323 : /* UTF-8 mode */
2324 0 : if (utf8)
2325 : {
2326 : register unsigned int d;
2327 0 : for (i = 1; i <= min; i++)
2328 : {
2329 0 : GETCHARINC(d, eptr);
2330 0 : if (d < 256) d = md->lcc[d];
2331 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2332 : }
2333 : }
2334 : else
2335 : #endif
2336 :
2337 : /* Not UTF-8 mode */
2338 : {
2339 0 : for (i = 1; i <= min; i++)
2340 0 : if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2341 : }
2342 :
2343 0 : if (min == max) continue;
2344 :
2345 0 : if (minimize)
2346 : {
2347 : #ifdef SUPPORT_UTF8
2348 : /* UTF-8 mode */
2349 0 : if (utf8)
2350 : {
2351 : register unsigned int d;
2352 0 : for (fi = min;; fi++)
2353 : {
2354 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2355 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356 0 : GETCHARINC(d, eptr);
2357 0 : if (d < 256) d = md->lcc[d];
2358 0 : if (fi >= max || eptr >= md->end_subject || fc == d)
2359 0 : RRETURN(MATCH_NOMATCH);
2360 0 : }
2361 : }
2362 : else
2363 : #endif
2364 : /* Not UTF-8 mode */
2365 : {
2366 0 : for (fi = min;; fi++)
2367 : {
2368 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2369 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2370 0 : if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2371 0 : RRETURN(MATCH_NOMATCH);
2372 0 : }
2373 : }
2374 : /* Control never gets here */
2375 : }
2376 :
2377 : /* Maximize case */
2378 :
2379 : else
2380 : {
2381 0 : pp = eptr;
2382 :
2383 : #ifdef SUPPORT_UTF8
2384 : /* UTF-8 mode */
2385 0 : if (utf8)
2386 : {
2387 : register unsigned int d;
2388 0 : for (i = min; i < max; i++)
2389 : {
2390 0 : int len = 1;
2391 0 : if (eptr >= md->end_subject) break;
2392 0 : GETCHARLEN(d, eptr, len);
2393 0 : if (d < 256) d = md->lcc[d];
2394 0 : if (fc == d) break;
2395 0 : eptr += len;
2396 : }
2397 0 : if (possessive) continue;
2398 : for(;;)
2399 : {
2400 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2401 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2402 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2403 0 : BACKCHAR(eptr);
2404 0 : }
2405 : }
2406 : else
2407 : #endif
2408 : /* Not UTF-8 mode */
2409 : {
2410 0 : for (i = min; i < max; i++)
2411 : {
2412 0 : if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2413 0 : eptr++;
2414 : }
2415 0 : if (possessive) continue;
2416 0 : while (eptr >= pp)
2417 : {
2418 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2419 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2420 0 : eptr--;
2421 : }
2422 : }
2423 :
2424 0 : RRETURN(MATCH_NOMATCH);
2425 : }
2426 : /* Control never gets here */
2427 : }
2428 :
2429 : /* Caseful comparisons */
2430 :
2431 : else
2432 : {
2433 : #ifdef SUPPORT_UTF8
2434 : /* UTF-8 mode */
2435 0 : if (utf8)
2436 : {
2437 : register unsigned int d;
2438 0 : for (i = 1; i <= min; i++)
2439 : {
2440 0 : GETCHARINC(d, eptr);
2441 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2442 : }
2443 : }
2444 : else
2445 : #endif
2446 : /* Not UTF-8 mode */
2447 : {
2448 0 : for (i = 1; i <= min; i++)
2449 0 : if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2450 : }
2451 :
2452 0 : if (min == max) continue;
2453 :
2454 0 : if (minimize)
2455 : {
2456 : #ifdef SUPPORT_UTF8
2457 : /* UTF-8 mode */
2458 0 : if (utf8)
2459 : {
2460 : register unsigned int d;
2461 0 : for (fi = min;; fi++)
2462 : {
2463 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2464 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2465 0 : GETCHARINC(d, eptr);
2466 0 : if (fi >= max || eptr >= md->end_subject || fc == d)
2467 0 : RRETURN(MATCH_NOMATCH);
2468 0 : }
2469 : }
2470 : else
2471 : #endif
2472 : /* Not UTF-8 mode */
2473 : {
2474 0 : for (fi = min;; fi++)
2475 : {
2476 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2477 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2478 0 : if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2479 0 : RRETURN(MATCH_NOMATCH);
2480 0 : }
2481 : }
2482 : /* Control never gets here */
2483 : }
2484 :
2485 : /* Maximize case */
2486 :
2487 : else
2488 : {
2489 0 : pp = eptr;
2490 :
2491 : #ifdef SUPPORT_UTF8
2492 : /* UTF-8 mode */
2493 0 : if (utf8)
2494 : {
2495 : register unsigned int d;
2496 0 : for (i = min; i < max; i++)
2497 : {
2498 0 : int len = 1;
2499 0 : if (eptr >= md->end_subject) break;
2500 0 : GETCHARLEN(d, eptr, len);
2501 0 : if (fc == d) break;
2502 0 : eptr += len;
2503 : }
2504 0 : if (possessive) continue;
2505 : for(;;)
2506 : {
2507 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2508 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2510 0 : BACKCHAR(eptr);
2511 0 : }
2512 : }
2513 : else
2514 : #endif
2515 : /* Not UTF-8 mode */
2516 : {
2517 0 : for (i = min; i < max; i++)
2518 : {
2519 0 : if (eptr >= md->end_subject || fc == *eptr) break;
2520 0 : eptr++;
2521 : }
2522 0 : if (possessive) continue;
2523 0 : while (eptr >= pp)
2524 : {
2525 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2526 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527 0 : eptr--;
2528 : }
2529 : }
2530 :
2531 0 : RRETURN(MATCH_NOMATCH);
2532 : }
2533 : }
2534 : /* Control never gets here */
2535 :
2536 : /* Match a single character type repeatedly; several different opcodes
2537 : share code. This is very similar to the code for single characters, but we
2538 : repeat it in the interests of efficiency. */
2539 :
2540 : case OP_TYPEEXACT:
2541 0 : min = max = GET2(ecode, 1);
2542 0 : minimize = TRUE;
2543 0 : ecode += 3;
2544 0 : goto REPEATTYPE;
2545 :
2546 : case OP_TYPEUPTO:
2547 : case OP_TYPEMINUPTO:
2548 0 : min = 0;
2549 0 : max = GET2(ecode, 1);
2550 0 : minimize = *ecode == OP_TYPEMINUPTO;
2551 0 : ecode += 3;
2552 0 : goto REPEATTYPE;
2553 :
2554 : case OP_TYPEPOSSTAR:
2555 1 : possessive = TRUE;
2556 1 : min = 0;
2557 1 : max = INT_MAX;
2558 1 : ecode++;
2559 1 : goto REPEATTYPE;
2560 :
2561 : case OP_TYPEPOSPLUS:
2562 0 : possessive = TRUE;
2563 0 : min = 1;
2564 0 : max = INT_MAX;
2565 0 : ecode++;
2566 0 : goto REPEATTYPE;
2567 :
2568 : case OP_TYPEPOSQUERY:
2569 0 : possessive = TRUE;
2570 0 : min = 0;
2571 0 : max = 1;
2572 0 : ecode++;
2573 0 : goto REPEATTYPE;
2574 :
2575 : case OP_TYPEPOSUPTO:
2576 0 : possessive = TRUE;
2577 0 : min = 0;
2578 0 : max = GET2(ecode, 1);
2579 0 : ecode += 3;
2580 0 : goto REPEATTYPE;
2581 :
2582 : case OP_TYPESTAR:
2583 : case OP_TYPEMINSTAR:
2584 : case OP_TYPEPLUS:
2585 : case OP_TYPEMINPLUS:
2586 : case OP_TYPEQUERY:
2587 : case OP_TYPEMINQUERY:
2588 418 : c = *ecode++ - OP_TYPESTAR;
2589 418 : minimize = (c & 1) != 0;
2590 418 : min = rep_min[c]; /* Pick up values from tables; */
2591 418 : max = rep_max[c]; /* zero for max => infinity */
2592 418 : if (max == 0) max = INT_MAX;
2593 :
2594 : /* Common code for all repeated single character type matches. Note that
2595 : in UTF-8 mode, '.' matches a character of any length, but for the other
2596 : character types, the valid characters are all one-byte long. */
2597 :
2598 419 : REPEATTYPE:
2599 419 : ctype = *ecode++; /* Code for the character type */
2600 :
2601 : #ifdef SUPPORT_UCP
2602 419 : if (ctype == OP_PROP || ctype == OP_NOTPROP)
2603 : {
2604 0 : prop_fail_result = ctype == OP_NOTPROP;
2605 0 : prop_type = *ecode++;
2606 0 : prop_value = *ecode++;
2607 : }
2608 419 : else prop_type = -1;
2609 : #endif
2610 :
2611 : /* First, ensure the minimum number of matches are present. Use inline
2612 : code for maximizing the speed, and do the type test once at the start
2613 : (i.e. keep it out of the loop). Also we can test that there are at least
2614 : the minimum number of bytes before we start. This isn't as effective in
2615 : UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2616 : is tidier. Also separate the UCP code, which can be the same for both UTF-8
2617 : and single-bytes. */
2618 :
2619 419 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2620 419 : if (min > 0)
2621 : {
2622 : #ifdef SUPPORT_UCP
2623 416 : if (prop_type >= 0)
2624 : {
2625 0 : switch(prop_type)
2626 : {
2627 : case PT_ANY:
2628 0 : if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2629 0 : for (i = 1; i <= min; i++)
2630 : {
2631 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2632 0 : GETCHARINC(c, eptr);
2633 : }
2634 0 : break;
2635 :
2636 : case PT_LAMP:
2637 0 : for (i = 1; i <= min; i++)
2638 : {
2639 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2640 0 : GETCHARINC(c, eptr);
2641 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2642 0 : if ((prop_chartype == ucp_Lu ||
2643 : prop_chartype == ucp_Ll ||
2644 : prop_chartype == ucp_Lt) == prop_fail_result)
2645 0 : RRETURN(MATCH_NOMATCH);
2646 : }
2647 0 : break;
2648 :
2649 : case PT_GC:
2650 0 : for (i = 1; i <= min; i++)
2651 : {
2652 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653 0 : GETCHARINC(c, eptr);
2654 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2655 0 : if ((prop_category == prop_value) == prop_fail_result)
2656 0 : RRETURN(MATCH_NOMATCH);
2657 : }
2658 0 : break;
2659 :
2660 : case PT_PC:
2661 0 : for (i = 1; i <= min; i++)
2662 : {
2663 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2664 0 : GETCHARINC(c, eptr);
2665 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2666 0 : if ((prop_chartype == prop_value) == prop_fail_result)
2667 0 : RRETURN(MATCH_NOMATCH);
2668 : }
2669 0 : break;
2670 :
2671 : case PT_SC:
2672 0 : for (i = 1; i <= min; i++)
2673 : {
2674 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2675 0 : GETCHARINC(c, eptr);
2676 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2677 0 : if ((prop_script == prop_value) == prop_fail_result)
2678 0 : RRETURN(MATCH_NOMATCH);
2679 : }
2680 0 : break;
2681 :
2682 : default:
2683 0 : RRETURN(PCRE_ERROR_INTERNAL);
2684 : }
2685 : }
2686 :
2687 : /* Match extended Unicode sequences. We will get here only if the
2688 : support is in the binary; otherwise a compile-time error occurs. */
2689 :
2690 416 : else if (ctype == OP_EXTUNI)
2691 : {
2692 0 : for (i = 1; i <= min; i++)
2693 : {
2694 0 : GETCHARINCTEST(c, eptr);
2695 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2696 0 : if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2697 0 : while (eptr < md->end_subject)
2698 : {
2699 0 : int len = 1;
2700 0 : if (!utf8) c = *eptr; else
2701 : {
2702 0 : GETCHARLEN(c, eptr, len);
2703 : }
2704 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2705 0 : if (prop_category != ucp_M) break;
2706 0 : eptr += len;
2707 : }
2708 : }
2709 : }
2710 :
2711 : else
2712 : #endif /* SUPPORT_UCP */
2713 :
2714 : /* Handle all other cases when the coding is UTF-8 */
2715 :
2716 : #ifdef SUPPORT_UTF8
2717 416 : if (utf8) switch(ctype)
2718 : {
2719 : case OP_ANY:
2720 0 : for (i = 1; i <= min; i++)
2721 : {
2722 0 : if (eptr >= md->end_subject ||
2723 : ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2724 0 : RRETURN(MATCH_NOMATCH);
2725 0 : eptr++;
2726 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2727 : }
2728 0 : break;
2729 :
2730 : case OP_ANYBYTE:
2731 0 : eptr += min;
2732 0 : break;
2733 :
2734 : case OP_ANYNL:
2735 0 : for (i = 1; i <= min; i++)
2736 : {
2737 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2738 0 : GETCHARINC(c, eptr);
2739 0 : switch(c)
2740 : {
2741 0 : default: RRETURN(MATCH_NOMATCH);
2742 : case 0x000d:
2743 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2744 : break;
2745 : case 0x000a:
2746 : case 0x000b:
2747 : case 0x000c:
2748 : case 0x0085:
2749 : case 0x2028:
2750 : case 0x2029:
2751 : break;
2752 : }
2753 : }
2754 0 : break;
2755 :
2756 : case OP_NOT_DIGIT:
2757 0 : for (i = 1; i <= min; i++)
2758 : {
2759 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2760 0 : GETCHARINC(c, eptr);
2761 0 : if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2762 0 : RRETURN(MATCH_NOMATCH);
2763 : }
2764 0 : break;
2765 :
2766 : case OP_DIGIT:
2767 0 : for (i = 1; i <= min; i++)
2768 : {
2769 0 : if (eptr >= md->end_subject ||
2770 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2771 0 : RRETURN(MATCH_NOMATCH);
2772 : /* No need to skip more bytes - we know it's a 1-byte character */
2773 : }
2774 0 : break;
2775 :
2776 : case OP_NOT_WHITESPACE:
2777 0 : for (i = 1; i <= min; i++)
2778 : {
2779 0 : if (eptr >= md->end_subject ||
2780 : (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2781 0 : RRETURN(MATCH_NOMATCH);
2782 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2783 : }
2784 0 : break;
2785 :
2786 : case OP_WHITESPACE:
2787 0 : for (i = 1; i <= min; i++)
2788 : {
2789 0 : if (eptr >= md->end_subject ||
2790 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2791 0 : RRETURN(MATCH_NOMATCH);
2792 : /* No need to skip more bytes - we know it's a 1-byte character */
2793 : }
2794 0 : break;
2795 :
2796 : case OP_NOT_WORDCHAR:
2797 0 : for (i = 1; i <= min; i++)
2798 : {
2799 0 : if (eptr >= md->end_subject ||
2800 : (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2801 0 : RRETURN(MATCH_NOMATCH);
2802 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2803 : }
2804 0 : break;
2805 :
2806 : case OP_WORDCHAR:
2807 0 : for (i = 1; i <= min; i++)
2808 : {
2809 0 : if (eptr >= md->end_subject ||
2810 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2811 0 : RRETURN(MATCH_NOMATCH);
2812 : /* No need to skip more bytes - we know it's a 1-byte character */
2813 : }
2814 0 : break;
2815 :
2816 : default:
2817 0 : RRETURN(PCRE_ERROR_INTERNAL);
2818 : } /* End switch(ctype) */
2819 :
2820 : else
2821 : #endif /* SUPPORT_UTF8 */
2822 :
2823 : /* Code for the non-UTF-8 case for minimum matching of operators other
2824 : than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2825 : number of bytes present, as this was tested above. */
2826 :
2827 416 : switch(ctype)
2828 : {
2829 : case OP_ANY:
2830 416 : if ((ims & PCRE_DOTALL) == 0)
2831 : {
2832 2 : for (i = 1; i <= min; i++)
2833 : {
2834 1 : if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2835 1 : eptr++;
2836 : }
2837 : }
2838 415 : else eptr += min;
2839 416 : break;
2840 :
2841 : case OP_ANYBYTE:
2842 0 : eptr += min;
2843 0 : break;
2844 :
2845 : /* Because of the CRLF case, we can't assume the minimum number of
2846 : bytes are present in this case. */
2847 :
2848 : case OP_ANYNL:
2849 0 : for (i = 1; i <= min; i++)
2850 : {
2851 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2852 0 : switch(*eptr++)
2853 : {
2854 0 : default: RRETURN(MATCH_NOMATCH);
2855 : case 0x000d:
2856 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2857 : break;
2858 : case 0x000a:
2859 : case 0x000b:
2860 : case 0x000c:
2861 : case 0x0085:
2862 : break;
2863 : }
2864 : }
2865 0 : break;
2866 :
2867 : case OP_NOT_DIGIT:
2868 0 : for (i = 1; i <= min; i++)
2869 0 : if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2870 0 : break;
2871 :
2872 : case OP_DIGIT:
2873 0 : for (i = 1; i <= min; i++)
2874 0 : if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2875 0 : break;
2876 :
2877 : case OP_NOT_WHITESPACE:
2878 0 : for (i = 1; i <= min; i++)
2879 0 : if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2880 0 : break;
2881 :
2882 : case OP_WHITESPACE:
2883 0 : for (i = 1; i <= min; i++)
2884 0 : if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2885 0 : break;
2886 :
2887 : case OP_NOT_WORDCHAR:
2888 0 : for (i = 1; i <= min; i++)
2889 0 : if ((md->ctypes[*eptr++] & ctype_word) != 0)
2890 0 : RRETURN(MATCH_NOMATCH);
2891 0 : break;
2892 :
2893 : case OP_WORDCHAR:
2894 0 : for (i = 1; i <= min; i++)
2895 0 : if ((md->ctypes[*eptr++] & ctype_word) == 0)
2896 0 : RRETURN(MATCH_NOMATCH);
2897 0 : break;
2898 :
2899 : default:
2900 0 : RRETURN(PCRE_ERROR_INTERNAL);
2901 : }
2902 : }
2903 :
2904 : /* If min = max, continue at the same level without recursing */
2905 :
2906 419 : if (min == max) continue;
2907 :
2908 : /* If minimizing, we have to test the rest of the pattern before each
2909 : subsequent match. Again, separate the UTF-8 case for speed, and also
2910 : separate the UCP cases. */
2911 :
2912 419 : if (minimize)
2913 : {
2914 : #ifdef SUPPORT_UCP
2915 414 : if (prop_type >= 0)
2916 : {
2917 0 : switch(prop_type)
2918 : {
2919 : case PT_ANY:
2920 0 : for (fi = min;; fi++)
2921 : {
2922 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2923 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2924 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2925 0 : GETCHARINC(c, eptr);
2926 0 : if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927 0 : }
2928 : /* Control never gets here */
2929 :
2930 : case PT_LAMP:
2931 0 : for (fi = min;; fi++)
2932 : {
2933 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2934 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2935 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2936 0 : GETCHARINC(c, eptr);
2937 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2938 0 : if ((prop_chartype == ucp_Lu ||
2939 : prop_chartype == ucp_Ll ||
2940 : prop_chartype == ucp_Lt) == prop_fail_result)
2941 0 : RRETURN(MATCH_NOMATCH);
2942 0 : }
2943 : /* Control never gets here */
2944 :
2945 : case PT_GC:
2946 0 : for (fi = min;; fi++)
2947 : {
2948 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2949 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2950 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 0 : GETCHARINC(c, eptr);
2952 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2953 0 : if ((prop_category == prop_value) == prop_fail_result)
2954 0 : RRETURN(MATCH_NOMATCH);
2955 0 : }
2956 : /* Control never gets here */
2957 :
2958 : case PT_PC:
2959 0 : for (fi = min;; fi++)
2960 : {
2961 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2962 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2963 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2964 0 : GETCHARINC(c, eptr);
2965 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2966 0 : if ((prop_chartype == prop_value) == prop_fail_result)
2967 0 : RRETURN(MATCH_NOMATCH);
2968 0 : }
2969 : /* Control never gets here */
2970 :
2971 : case PT_SC:
2972 0 : for (fi = min;; fi++)
2973 : {
2974 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2975 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2977 0 : GETCHARINC(c, eptr);
2978 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2979 0 : if ((prop_script == prop_value) == prop_fail_result)
2980 0 : RRETURN(MATCH_NOMATCH);
2981 0 : }
2982 : /* Control never gets here */
2983 :
2984 : default:
2985 0 : RRETURN(PCRE_ERROR_INTERNAL);
2986 : }
2987 : }
2988 :
2989 : /* Match extended Unicode sequences. We will get here only if the
2990 : support is in the binary; otherwise a compile-time error occurs. */
2991 :
2992 414 : else if (ctype == OP_EXTUNI)
2993 : {
2994 0 : for (fi = min;; fi++)
2995 : {
2996 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2997 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2998 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2999 0 : GETCHARINCTEST(c, eptr);
3000 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3001 0 : if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3002 0 : while (eptr < md->end_subject)
3003 : {
3004 0 : int len = 1;
3005 0 : if (!utf8) c = *eptr; else
3006 : {
3007 0 : GETCHARLEN(c, eptr, len);
3008 : }
3009 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3010 0 : if (prop_category != ucp_M) break;
3011 0 : eptr += len;
3012 : }
3013 0 : }
3014 : }
3015 :
3016 : else
3017 : #endif /* SUPPORT_UCP */
3018 :
3019 : #ifdef SUPPORT_UTF8
3020 : /* UTF-8 mode */
3021 414 : if (utf8)
3022 : {
3023 0 : for (fi = min;; fi++)
3024 : {
3025 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3026 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027 0 : if (fi >= max || eptr >= md->end_subject ||
3028 : (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3029 : IS_NEWLINE(eptr)))
3030 0 : RRETURN(MATCH_NOMATCH);
3031 :
3032 0 : GETCHARINC(c, eptr);
3033 0 : switch(ctype)
3034 : {
3035 : case OP_ANY: /* This is the DOTALL case */
3036 0 : break;
3037 :
3038 : case OP_ANYBYTE:
3039 0 : break;
3040 :
3041 : case OP_ANYNL:
3042 0 : switch(c)
3043 : {
3044 0 : default: RRETURN(MATCH_NOMATCH);
3045 : case 0x000d:
3046 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3047 : break;
3048 : case 0x000a:
3049 : case 0x000b:
3050 : case 0x000c:
3051 : case 0x0085:
3052 : case 0x2028:
3053 : case 0x2029:
3054 : break;
3055 : }
3056 0 : break;
3057 :
3058 : case OP_NOT_DIGIT:
3059 0 : if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3060 0 : RRETURN(MATCH_NOMATCH);
3061 0 : break;
3062 :
3063 : case OP_DIGIT:
3064 0 : if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3065 0 : RRETURN(MATCH_NOMATCH);
3066 0 : break;
3067 :
3068 : case OP_NOT_WHITESPACE:
3069 0 : if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3070 0 : RRETURN(MATCH_NOMATCH);
3071 0 : break;
3072 :
3073 : case OP_WHITESPACE:
3074 0 : if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3075 0 : RRETURN(MATCH_NOMATCH);
3076 0 : break;
3077 :
3078 : case OP_NOT_WORDCHAR:
3079 0 : if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3080 0 : RRETURN(MATCH_NOMATCH);
3081 0 : break;
3082 :
3083 : case OP_WORDCHAR:
3084 0 : if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3085 0 : RRETURN(MATCH_NOMATCH);
3086 0 : break;
3087 :
3088 : default:
3089 0 : RRETURN(PCRE_ERROR_INTERNAL);
3090 : }
3091 0 : }
3092 : }
3093 : else
3094 : #endif
3095 : /* Not UTF-8 mode */
3096 : {
3097 15368 : for (fi = min;; fi++)
3098 : {
3099 15368 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3100 15368 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3101 15000 : if (fi >= max || eptr >= md->end_subject ||
3102 : ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3103 46 : RRETURN(MATCH_NOMATCH);
3104 :
3105 14954 : c = *eptr++;
3106 14954 : switch(ctype)
3107 : {
3108 : case OP_ANY: /* This is the DOTALL case */
3109 14954 : break;
3110 :
3111 : case OP_ANYBYTE:
3112 0 : break;
3113 :
3114 : case OP_ANYNL:
3115 0 : switch(c)
3116 : {
3117 0 : default: RRETURN(MATCH_NOMATCH);
3118 : case 0x000d:
3119 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3120 : break;
3121 : case 0x000a:
3122 : case 0x000b:
3123 : case 0x000c:
3124 : case 0x0085:
3125 : break;
3126 : }
3127 0 : break;
3128 :
3129 : case OP_NOT_DIGIT:
3130 0 : if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3131 0 : break;
3132 :
3133 : case OP_DIGIT:
3134 0 : if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3135 0 : break;
3136 :
3137 : case OP_NOT_WHITESPACE:
3138 0 : if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3139 0 : break;
3140 :
3141 : case OP_WHITESPACE:
3142 0 : if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3143 0 : break;
3144 :
3145 : case OP_NOT_WORDCHAR:
3146 0 : if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3147 0 : break;
3148 :
3149 : case OP_WORDCHAR:
3150 0 : if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3151 0 : break;
3152 :
3153 : default:
3154 0 : RRETURN(PCRE_ERROR_INTERNAL);
3155 : }
3156 14954 : }
3157 : }
3158 : /* Control never gets here */
3159 : }
3160 :
3161 : /* If maximizing, it is worth using inline code for speed, doing the type
3162 : test once at the start (i.e. keep it out of the loop). Again, keep the
3163 : UTF-8 and UCP stuff separate. */
3164 :
3165 : else
3166 : {
3167 5 : pp = eptr; /* Remember where we started */
3168 :
3169 : #ifdef SUPPORT_UCP
3170 5 : if (prop_type >= 0)
3171 : {
3172 0 : switch(prop_type)
3173 : {
3174 : case PT_ANY:
3175 0 : for (i = min; i < max; i++)
3176 : {
3177 0 : int len = 1;
3178 0 : if (eptr >= md->end_subject) break;
3179 0 : GETCHARLEN(c, eptr, len);
3180 0 : if (prop_fail_result) break;
3181 0 : eptr+= len;
3182 : }
3183 0 : break;
3184 :
3185 : case PT_LAMP:
3186 0 : for (i = min; i < max; i++)
3187 : {
3188 0 : int len = 1;
3189 0 : if (eptr >= md->end_subject) break;
3190 0 : GETCHARLEN(c, eptr, len);
3191 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3192 0 : if ((prop_chartype == ucp_Lu ||
3193 : prop_chartype == ucp_Ll ||
3194 : prop_chartype == ucp_Lt) == prop_fail_result)
3195 0 : break;
3196 0 : eptr+= len;
3197 : }
3198 0 : break;
3199 :
3200 : case PT_GC:
3201 0 : for (i = min; i < max; i++)
3202 : {
3203 0 : int len = 1;
3204 0 : if (eptr >= md->end_subject) break;
3205 0 : GETCHARLEN(c, eptr, len);
3206 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3207 0 : if ((prop_category == prop_value) == prop_fail_result)
3208 0 : break;
3209 0 : eptr+= len;
3210 : }
3211 0 : break;
3212 :
3213 : case PT_PC:
3214 0 : for (i = min; i < max; i++)
3215 : {
3216 0 : int len = 1;
3217 0 : if (eptr >= md->end_subject) break;
3218 0 : GETCHARLEN(c, eptr, len);
3219 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3220 0 : if ((prop_chartype == prop_value) == prop_fail_result)
3221 0 : break;
3222 0 : eptr+= len;
3223 : }
3224 0 : break;
3225 :
3226 : case PT_SC:
3227 0 : for (i = min; i < max; i++)
3228 : {
3229 0 : int len = 1;
3230 0 : if (eptr >= md->end_subject) break;
3231 0 : GETCHARLEN(c, eptr, len);
3232 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3233 0 : if ((prop_script == prop_value) == prop_fail_result)
3234 0 : break;
3235 0 : eptr+= len;
3236 : }
3237 : break;
3238 : }
3239 :
3240 : /* eptr is now past the end of the maximum run */
3241 :
3242 0 : if (possessive) continue;
3243 : for(;;)
3244 : {
3245 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3246 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3247 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
3248 0 : BACKCHAR(eptr);
3249 0 : }
3250 : }
3251 :
3252 : /* Match extended Unicode sequences. We will get here only if the
3253 : support is in the binary; otherwise a compile-time error occurs. */
3254 :
3255 5 : else if (ctype == OP_EXTUNI)
3256 : {
3257 0 : for (i = min; i < max; i++)
3258 : {
3259 0 : if (eptr >= md->end_subject) break;
3260 0 : GETCHARINCTEST(c, eptr);
3261 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3262 0 : if (prop_category == ucp_M) break;
3263 0 : while (eptr < md->end_subject)
3264 : {
3265 0 : int len = 1;
3266 0 : if (!utf8) c = *eptr; else
3267 : {
3268 0 : GETCHARLEN(c, eptr, len);
3269 : }
3270 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3271 0 : if (prop_category != ucp_M) break;
3272 0 : eptr += len;
3273 : }
3274 : }
3275 :
3276 : /* eptr is now past the end of the maximum run */
3277 :
3278 0 : if (possessive) continue;
3279 : for(;;)
3280 : {
3281 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3282 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3283 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
3284 : for (;;) /* Move back over one extended */
3285 : {
3286 0 : int len = 1;
3287 0 : BACKCHAR(eptr);
3288 0 : if (!utf8) c = *eptr; else
3289 : {
3290 0 : GETCHARLEN(c, eptr, len);
3291 : }
3292 0 : prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3293 0 : if (prop_category != ucp_M) break;
3294 0 : eptr--;
3295 0 : }
3296 0 : }
3297 : }
3298 :
3299 : else
3300 : #endif /* SUPPORT_UCP */
3301 :
3302 : #ifdef SUPPORT_UTF8
3303 : /* UTF-8 mode */
3304 :
3305 5 : if (utf8)
3306 : {
3307 0 : switch(ctype)
3308 : {
3309 : case OP_ANY:
3310 :
3311 : /* Special code is required for UTF8, but when the maximum is
3312 : unlimited we don't need it, so we repeat the non-UTF8 code. This is
3313 : probably worth it, because .* is quite a common idiom. */
3314 :
3315 0 : if (max < INT_MAX)
3316 : {
3317 0 : if ((ims & PCRE_DOTALL) == 0)
3318 : {
3319 0 : for (i = min; i < max; i++)
3320 : {
3321 0 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3322 0 : eptr++;
3323 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3324 : }
3325 : }
3326 : else
3327 : {
3328 0 : for (i = min; i < max; i++)
3329 : {
3330 0 : if (eptr >= md->end_subject) break;
3331 0 : eptr++;
3332 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3333 : }
3334 : }
3335 : }
3336 :
3337 : /* Handle unlimited UTF-8 repeat */
3338 :
3339 : else
3340 : {
3341 0 : if ((ims & PCRE_DOTALL) == 0)
3342 : {
3343 0 : for (i = min; i < max; i++)
3344 : {
3345 0 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3346 0 : eptr++;
3347 : }
3348 0 : break;
3349 : }
3350 : else
3351 : {
3352 0 : c = max - min;
3353 0 : if (c > (unsigned int)(md->end_subject - eptr))
3354 0 : c = md->end_subject - eptr;
3355 0 : eptr += c;
3356 : }
3357 : }
3358 0 : break;
3359 :
3360 : /* The byte case is the same as non-UTF8 */
3361 :
3362 : case OP_ANYBYTE:
3363 0 : c = max - min;
3364 0 : if (c > (unsigned int)(md->end_subject - eptr))
3365 0 : c = md->end_subject - eptr;
3366 0 : eptr += c;
3367 0 : break;
3368 :
3369 : case OP_ANYNL:
3370 0 : for (i = min; i < max; i++)
3371 : {
3372 0 : int len = 1;
3373 0 : if (eptr >= md->end_subject) break;
3374 0 : GETCHARLEN(c, eptr, len);
3375 0 : if (c == 0x000d)
3376 : {
3377 0 : if (++eptr >= md->end_subject) break;
3378 0 : if (*eptr == 0x000a) eptr++;
3379 : }
3380 : else
3381 : {
3382 0 : if (c != 0x000a && c != 0x000b && c != 0x000c &&
3383 : c != 0x0085 && c != 0x2028 && c != 0x2029)
3384 0 : break;
3385 0 : eptr += len;
3386 : }
3387 : }
3388 0 : break;
3389 :
3390 : case OP_NOT_DIGIT:
3391 0 : for (i = min; i < max; i++)
3392 : {
3393 0 : int len = 1;
3394 0 : if (eptr >= md->end_subject) break;
3395 0 : GETCHARLEN(c, eptr, len);
3396 0 : if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3397 0 : eptr+= len;
3398 : }
3399 0 : break;
3400 :
3401 : case OP_DIGIT:
3402 0 : for (i = min; i < max; i++)
3403 : {
3404 0 : int len = 1;
3405 0 : if (eptr >= md->end_subject) break;
3406 0 : GETCHARLEN(c, eptr, len);
3407 0 : if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3408 0 : eptr+= len;
3409 : }
3410 0 : break;
3411 :
3412 : case OP_NOT_WHITESPACE:
3413 0 : for (i = min; i < max; i++)
3414 : {
3415 0 : int len = 1;
3416 0 : if (eptr >= md->end_subject) break;
3417 0 : GETCHARLEN(c, eptr, len);
3418 0 : if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3419 0 : eptr+= len;
3420 : }
3421 0 : break;
3422 :
3423 : case OP_WHITESPACE:
3424 0 : for (i = min; i < max; i++)
3425 : {
3426 0 : int len = 1;
3427 0 : if (eptr >= md->end_subject) break;
3428 0 : GETCHARLEN(c, eptr, len);
3429 0 : if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3430 0 : eptr+= len;
3431 : }
3432 0 : break;
3433 :
3434 : case OP_NOT_WORDCHAR:
3435 0 : for (i = min; i < max; i++)
3436 : {
3437 0 : int len = 1;
3438 0 : if (eptr >= md->end_subject) break;
3439 0 : GETCHARLEN(c, eptr, len);
3440 0 : if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3441 0 : eptr+= len;
3442 : }
3443 0 : break;
3444 :
3445 : case OP_WORDCHAR:
3446 0 : for (i = min; i < max; i++)
3447 : {
3448 0 : int len = 1;
3449 0 : if (eptr >= md->end_subject) break;
3450 0 : GETCHARLEN(c, eptr, len);
3451 0 : if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3452 0 : eptr+= len;
3453 : }
3454 0 : break;
3455 :
3456 : default:
3457 0 : RRETURN(PCRE_ERROR_INTERNAL);
3458 : }
3459 :
3460 : /* eptr is now past the end of the maximum run */
3461 :
3462 0 : if (possessive) continue;
3463 : for(;;)
3464 : {
3465 0 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3466 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
3468 0 : BACKCHAR(eptr);
3469 0 : }
3470 : }
3471 : else
3472 : #endif
3473 :
3474 : /* Not UTF-8 mode */
3475 : {
3476 5 : switch(ctype)
3477 : {
3478 : case OP_ANY:
3479 2 : if ((ims & PCRE_DOTALL) == 0)
3480 : {
3481 15 : for (i = min; i < max; i++)
3482 : {
3483 15 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3484 14 : eptr++;
3485 : }
3486 1 : break;
3487 : }
3488 : /* For DOTALL case, fall through and treat as \C */
3489 :
3490 : case OP_ANYBYTE:
3491 1 : c = max - min;
3492 1 : if (c > (unsigned int)(md->end_subject - eptr))
3493 1 : c = md->end_subject - eptr;
3494 1 : eptr += c;
3495 1 : break;
3496 :
3497 : case OP_ANYNL:
3498 0 : for (i = min; i < max; i++)
3499 : {
3500 0 : if (eptr >= md->end_subject) break;
3501 0 : c = *eptr;
3502 0 : if (c == 0x000d)
3503 : {
3504 0 : if (++eptr >= md->end_subject) break;
3505 0 : if (*eptr == 0x000a) eptr++;
3506 : }
3507 : else
3508 : {
3509 0 : if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3510 0 : break;
3511 0 : eptr++;
3512 : }
3513 : }
3514 0 : break;
3515 :
3516 : case OP_NOT_DIGIT:
3517 0 : for (i = min; i < max; i++)
3518 : {
3519 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3520 : break;
3521 0 : eptr++;
3522 : }
3523 0 : break;
3524 :
3525 : case OP_DIGIT:
3526 0 : for (i = min; i < max; i++)
3527 : {
3528 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3529 : break;
3530 0 : eptr++;
3531 : }
3532 0 : break;
3533 :
3534 : case OP_NOT_WHITESPACE:
3535 0 : for (i = min; i < max; i++)
3536 : {
3537 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3538 : break;
3539 0 : eptr++;
3540 : }
3541 0 : break;
3542 :
3543 : case OP_WHITESPACE:
3544 4 : for (i = min; i < max; i++)
3545 : {
3546 4 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3547 : break;
3548 1 : eptr++;
3549 : }
3550 3 : break;
3551 :
3552 : case OP_NOT_WORDCHAR:
3553 0 : for (i = min; i < max; i++)
3554 : {
3555 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3556 : break;
3557 0 : eptr++;
3558 : }
3559 0 : break;
3560 :
3561 : case OP_WORDCHAR:
3562 0 : for (i = min; i < max; i++)
3563 : {
3564 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3565 : break;
3566 0 : eptr++;
3567 : }
3568 0 : break;
3569 :
3570 : default:
3571 0 : RRETURN(PCRE_ERROR_INTERNAL);
3572 : }
3573 :
3574 : /* eptr is now past the end of the maximum run */
3575 :
3576 5 : if (possessive) continue;
3577 38 : while (eptr >= pp)
3578 : {
3579 34 : RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3580 34 : eptr--;
3581 34 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3582 : }
3583 : }
3584 :
3585 : /* Get here if we can't make it match with any permitted repetitions */
3586 :
3587 0 : RRETURN(MATCH_NOMATCH);
3588 : }
3589 : /* Control never gets here */
3590 :
3591 : /* There's been some horrible disaster. Arrival here can only mean there is
3592 : something seriously wrong in the code above or the OP_xxx definitions. */
3593 :
3594 : default:
3595 : DPRINTF(("Unknown opcode %d\n", *ecode));
3596 0 : RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3597 : }
3598 :
3599 : /* Do not stick any code in here without much thought; it is assumed
3600 : that "continue" in the code above comes out to here to repeat the main
3601 : loop. */
3602 :
3603 149784 : } /* End of main loop */
3604 : /* Control never reaches here */
3605 : }
3606 :
3607 :
3608 : /***************************************************************************
3609 : ****************************************************************************
3610 : RECURSION IN THE match() FUNCTION
3611 : Undefine all the macros that were defined above to handle this, so that names
3612 : such as length, ims, flags and pp are plain identifiers in the code below. */
3613 :
3614 : #ifdef NO_RECURSE
3615 : #undef eptr
3616 : #undef ecode
3617 : #undef offset_top
3618 : #undef ims
3619 : #undef eptrb
3620 : #undef flags
3621 :
3622 : #undef callpat
3623 : #undef charptr
3624 : #undef data
3625 : #undef next
3626 : #undef pp
3627 : #undef prev
3628 : #undef saved_eptr
3629 :
3630 : #undef new_recursive
3631 :
3632 : #undef cur_is_word
3633 : #undef condition
3634 : #undef prev_is_word
3635 :
3636 : #undef original_ims
3637 :
3638 : #undef ctype
3639 : #undef length
3640 : #undef max
3641 : #undef min
3642 : #undef number
3643 : #undef offset
3644 : #undef op
3645 : #undef save_capture_last
3646 : #undef save_offset1
3647 : #undef save_offset2
3648 : #undef save_offset3
3649 : #undef stacksave
3650 :
3651 : #undef newptrb
3652 :
3653 : #endif /* NO_RECURSE */
3654 :
3655 : /* fc and fi are macros both with and without NO_RECURSE: always undefine */
3656 :
3657 : #undef fc
3658 : #undef fi
3659 :
3660 : /***************************************************************************
3661 : ***************************************************************************/
3662 :
3663 :
3664 :
3665 : /*************************************************
3666 : * Execute a Regular Expression *
3667 : *************************************************/
3668 :
3669 : /* This function applies a compiled re to a subject string and picks out
3670 : portions of the string if it matches. Two elements in the vector are set for
3671 : each substring: the offsets to the start and end of the substring.
3672 :
3673 : Arguments:
3674 : argument_re points to the compiled expression
3675 : extra_data points to extra data or is NULL
3676 : subject points to the subject string
3677 : length length of subject string (may contain binary zeros)
3678 : start_offset where to start in the subject string
3679 : options option bits
3680 : offsets points to a vector of ints to be filled in with offsets
3681 : offsetcount the number of elements in the vector
3682 :
3683 : Returns: > 0 => success; value is the number of elements filled in
3684 : = 0 => success, but offsets is not big enough
3685 : -1 => failed to match
3686 : < -1 => some kind of unexpected problem
3687 : */
3688 :
3689 : PCRE_DATA_SCOPE int
3690 : pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3691 : PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3692 : int offsetcount)
3693 21494 : {
3694 : int rc, resetcount, ocount;
3695 21494 : int first_byte = -1;
3696 21494 : int req_byte = -1;
3697 21494 : int req_byte2 = -1;
3698 : int newline;
3699 : unsigned long int ims;
3700 21494 : BOOL using_temporary_offsets = FALSE;
3701 : BOOL anchored;
3702 : BOOL startline;
3703 : BOOL firstline;
3704 21494 : BOOL first_byte_caseless = FALSE;
3705 21494 : BOOL req_byte_caseless = FALSE;
3706 : BOOL utf8;
3707 : match_data match_block;
3708 21494 : match_data *md = &match_block;
3709 : const uschar *tables;
3710 21494 : const uschar *start_bits = NULL;
3711 21494 : USPTR start_match = (USPTR)subject + start_offset;
3712 : USPTR end_subject;
3713 21494 : USPTR req_byte_ptr = start_match - 1;
3714 : eptrblock eptrchain[EPTR_WORK_SIZE];
3715 :
3716 : pcre_study_data internal_study;
3717 : const pcre_study_data *study;
3718 :
3719 : real_pcre internal_re;
3720 21494 : const real_pcre *external_re = (const real_pcre *)argument_re;
3721 21494 : const real_pcre *re = external_re;
3722 :
3723 : /* Plausibility checks */
3724 :
3725 21494 : if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3726 21494 : if (re == NULL || subject == NULL ||
3727 0 : (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3728 21494 : if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3729 :
3730 : /* Fish out the optional data from the extra_data structure, first setting
3731 : the default values. */
3732 :
3733 21494 : study = NULL;
3734 21494 : md->match_limit = MATCH_LIMIT;
3735 21494 : md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3736 21494 : md->callout_data = NULL;
3737 :
3738 : /* The table pointer is always in native byte order. */
3739 :
3740 21494 : tables = external_re->tables;
3741 :
3742 21494 : if (extra_data != NULL)
3743 : {
3744 21494 : register unsigned int flags = extra_data->flags;
3745 21494 : if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3746 0 : study = (const pcre_study_data *)extra_data->study_data;
3747 21494 : if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3748 21494 : md->match_limit = extra_data->match_limit;
3749 21494 : if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3750 21494 : md->match_limit_recursion = extra_data->match_limit_recursion;
3751 21494 : if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3752 0 : md->callout_data = extra_data->callout_data;
3753 21494 : if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3754 : }
3755 :
3756 : /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3757 : is a feature that makes it possible to save compiled regex and re-use them
3758 : in other programs later. */
3759 :
3760 21494 : if (tables == NULL) tables = _pcre_default_tables;
3761 :
3762 : /* Check that the first field in the block is the magic number. If it is not,
3763 : test for a regex that was compiled on a host of opposite endianness. If this is
3764 : the case, flipped values are put in internal_re and internal_study if there was
3765 : study data too. */
3766 :
3767 21494 : if (re->magic_number != MAGIC_NUMBER)
3768 : {
3769 0 : re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3770 0 : if (re == NULL) return PCRE_ERROR_BADMAGIC;
3771 0 : if (study != NULL) study = &internal_study;
3772 : }
3773 :
3774 : /* Set up other data */
3775 :
3776 21494 : anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3777 21494 : startline = (re->options & PCRE_STARTLINE) != 0;
3778 21494 : firstline = (re->options & PCRE_FIRSTLINE) != 0;
3779 :
3780 : /* The code starts after the real_pcre block and the capture name table. */
3781 :
3782 21494 : md->start_code = (const uschar *)external_re + re->name_table_offset +
3783 : re->name_count * re->name_entry_size;
3784 :
3785 21494 : md->start_subject = (USPTR)subject;
3786 21494 : md->start_offset = start_offset;
3787 21494 : md->end_subject = md->start_subject + length;
3788 21494 : end_subject = md->end_subject;
3789 :
3790 21494 : md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3791 21494 : utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3792 :
3793 21494 : md->notbol = (options & PCRE_NOTBOL) != 0;
3794 21494 : md->noteol = (options & PCRE_NOTEOL) != 0;
3795 21494 : md->notempty = (options & PCRE_NOTEMPTY) != 0;
3796 21494 : md->partial = (options & PCRE_PARTIAL) != 0;
3797 21494 : md->hitend = FALSE;
3798 :
3799 21494 : md->recursive = NULL; /* No recursion at top level */
3800 21494 : md->eptrchain = eptrchain; /* Make workspace generally available */
3801 :
3802 21494 : md->lcc = tables + lcc_offset;
3803 21494 : md->ctypes = tables + ctypes_offset;
3804 :
3805 : /* Handle different types of newline. The two bits give four cases. If nothing
3806 : is set at run time, whatever was used at compile time applies. */
3807 :
3808 : switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3809 21494 : PCRE_NEWLINE_BITS)
3810 : {
3811 21494 : case 0: newline = NEWLINE; break; /* Compile-time default */
3812 0 : case PCRE_NEWLINE_CR: newline = '\r'; break;
3813 0 : case PCRE_NEWLINE_LF: newline = '\n'; break;
3814 : case PCRE_NEWLINE_CR+
3815 0 : PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3816 0 : case PCRE_NEWLINE_ANY: newline = -1; break;
3817 0 : default: return PCRE_ERROR_BADNEWLINE;
3818 : }
3819 :
3820 21494 : if (newline < 0)
3821 : {
3822 0 : md->nltype = NLTYPE_ANY;
3823 : }
3824 : else
3825 : {
3826 21494 : md->nltype = NLTYPE_FIXED;
3827 21494 : if (newline > 255)
3828 : {
3829 0 : md->nllen = 2;
3830 0 : md->nl[0] = (newline >> 8) & 255;
3831 0 : md->nl[1] = newline & 255;
3832 : }
3833 : else
3834 : {
3835 21494 : md->nllen = 1;
3836 21494 : md->nl[0] = newline;
3837 : }
3838 : }
3839 :
3840 : /* Partial matching is supported only for a restricted set of regexes at the
3841 : moment. */
3842 :
3843 21494 : if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3844 0 : return PCRE_ERROR_BADPARTIAL;
3845 :
3846 : /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3847 : back the character offset. */
3848 :
3849 : #ifdef SUPPORT_UTF8
3850 21494 : if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3851 : {
3852 0 : if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3853 0 : return PCRE_ERROR_BADUTF8;
3854 0 : if (start_offset > 0 && start_offset < length)
3855 : {
3856 0 : int tb = ((uschar *)subject)[start_offset];
3857 0 : if (tb > 127)
3858 : {
3859 0 : tb &= 0xc0;
3860 0 : if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3861 : }
3862 : }
3863 : }
3864 : #endif
3865 :
3866 : /* The ims options can vary during the matching as a result of the presence
3867 : of (?ims) items in the pattern. They are kept in a local variable so that
3868 : restoring at the exit of a group is easy. */
3869 :
3870 21494 : ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3871 :
3872 : /* If the expression has got more back references than the offsets supplied can
3873 : hold, we get a temporary chunk of working store to use during the matching.
3874 : Otherwise, we can use the vector supplied, rounding down its size to a multiple
3875 : of 3. */
3876 :
3877 21494 : ocount = offsetcount - (offsetcount % 3);
3878 :
3879 21494 : if (re->top_backref > 0 && re->top_backref >= ocount/3)
3880 : {
3881 0 : ocount = re->top_backref * 3 + 3;
3882 0 : md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3883 0 : if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3884 0 : using_temporary_offsets = TRUE;
3885 : DPRINTF(("Got memory to hold back references\n"));
3886 : }
3887 21494 : else md->offset_vector = offsets;
3888 :
3889 21494 : md->offset_end = ocount;
3890 21494 : md->offset_max = (2*ocount)/3;
3891 21494 : md->offset_overflow = FALSE;
3892 21494 : md->capture_last = -1;
3893 :
3894 : /* Compute the minimum number of offsets that we need to reset each time. Doing
3895 : this makes a huge difference to execution time when there aren't many brackets
3896 : in the pattern. */
3897 :
3898 21494 : resetcount = 2 + re->top_bracket * 2;
3899 21494 : if (resetcount > offsetcount) resetcount = ocount;
3900 :
3901 : /* Reset the working variable associated with each extraction. These should
3902 : never be used unless previously set, but they get saved and restored, and so we
3903 : initialize them to avoid reading uninitialized locations. */
3904 :
3905 21494 : if (md->offset_vector != NULL)
3906 : {
3907 21494 : register int *iptr = md->offset_vector + ocount;
3908 21494 : register int *iend = iptr - resetcount/2 + 1;
3909 21494 : while (--iptr >= iend) *iptr = -1;
3910 : }
3911 :
3912 : /* Set up the first character to match, if available. The first_byte value is
3913 : never set for an anchored regular expression, but the anchoring may be forced
3914 : at run time, so we have to test for anchoring. The first char may be unset for
3915 : an unanchored pattern, of course. If there's no first char and the pattern was
3916 : studied, there may be a bitmap of possible first characters. */
3917 :
3918 21494 : if (!anchored)
3919 : {
3920 350 : if ((re->options & PCRE_FIRSTSET) != 0)
3921 : {
3922 347 : first_byte = re->first_byte & 255;
3923 347 : if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3924 0 : first_byte = md->lcc[first_byte];
3925 : }
3926 : else
3927 3 : if (!startline && study != NULL &&
3928 : (study->options & PCRE_STUDY_MAPPED) != 0)
3929 0 : start_bits = study->start_bits;
3930 : }
3931 :
3932 : /* For anchored or unanchored matches, there may be a "last known required
3933 : character" set. */
3934 :
3935 21494 : if ((re->options & PCRE_REQCHSET) != 0)
3936 : {
3937 5945 : req_byte = re->req_byte & 255;
3938 5945 : req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3939 5945 : req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3940 : }
3941 :
3942 :
3943 : /* ==========================================================================*/
3944 :
3945 : /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3946 : the loop runs just once. */
3947 :
3948 : for(;;)
3949 : {
3950 21562 : USPTR save_end_subject = end_subject;
3951 :
3952 : /* Reset the maximum number of extractions we might see. */
3953 :
3954 21562 : if (md->offset_vector != NULL)
3955 : {
3956 21562 : register int *iptr = md->offset_vector;
3957 21562 : register int *iend = iptr + resetcount;
3958 21562 : while (iptr < iend) *iptr++ = -1;
3959 : }
3960 :
3961 : /* Advance to a unique first char if possible. If firstline is TRUE, the
3962 : start of the match is constrained to the first line of a multiline string.
3963 : That is, the match must be before or at the first newline. Implement this by
3964 : temporarily adjusting end_subject so that we stop scanning at a newline. If
3965 : the match fails at the newline, later code breaks this loop. */
3966 :
3967 21562 : if (firstline)
3968 : {
3969 0 : USPTR t = start_match;
3970 0 : while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3971 0 : end_subject = t;
3972 : }
3973 :
3974 : /* Now test for a unique first byte */
3975 :
3976 21562 : if (first_byte >= 0)
3977 : {
3978 347 : if (first_byte_caseless)
3979 0 : while (start_match < end_subject &&
3980 : md->lcc[*start_match] != first_byte)
3981 0 : start_match++;
3982 : else
3983 36320 : while (start_match < end_subject && *start_match != first_byte)
3984 35626 : start_match++;
3985 : }
3986 :
3987 : /* Or to just after a linebreak for a multiline match if possible */
3988 :
3989 21215 : else if (startline)
3990 : {
3991 0 : if (start_match > md->start_subject + start_offset)
3992 : {
3993 0 : while (start_match <= end_subject && !WAS_NEWLINE(start_match))
3994 0 : start_match++;
3995 : }
3996 : }
3997 :
3998 : /* Or to a non-unique first char after study */
3999 :
4000 21215 : else if (start_bits != NULL)
4001 : {
4002 0 : while (start_match < end_subject)
4003 : {
4004 0 : register unsigned int c = *start_match;
4005 0 : if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4006 : }
4007 : }
4008 :
4009 : /* Restore fudged end_subject */
4010 :
4011 21562 : end_subject = save_end_subject;
4012 :
4013 : #ifdef DEBUG /* Sigh. Some compilers never learn. */
4014 : printf(">>>> Match against: ");
4015 : pchars(start_match, end_subject - start_match, TRUE, md);
4016 : printf("\n");
4017 : #endif
4018 :
4019 : /* If req_byte is set, we know that that character must appear in the subject
4020 : for the match to succeed. If the first character is set, req_byte must be
4021 : later in the subject; otherwise the test starts at the match point. This
4022 : optimization can save a huge amount of backtracking in patterns with nested
4023 : unlimited repeats that aren't going to match. Writing separate code for
4024 : cased/caseless versions makes it go faster, as does using an autoincrement
4025 : and backing off on a match.
4026 :
4027 : HOWEVER: when the subject string is very, very long, searching to its end can
4028 : take a long time, and give bad performance on quite ordinary patterns. This
4029 : showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4030 : string... so we don't do this when the string is sufficiently long.
4031 :
4032 : ALSO: this processing is disabled when partial matching is requested.
4033 : */
4034 :
4035 21562 : if (req_byte >= 0 &&
4036 : end_subject - start_match < REQ_BYTE_MAX &&
4037 : !md->partial)
4038 : {
4039 5917 : register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4040 :
4041 : /* We don't need to repeat the search if we haven't yet reached the
4042 : place we found it at last time. */
4043 :
4044 5917 : if (p > req_byte_ptr)
4045 : {
4046 5917 : if (req_byte_caseless)
4047 : {
4048 5 : while (p < end_subject)
4049 : {
4050 4 : register int pp = *p++;
4051 4 : if (pp == req_byte || pp == req_byte2) { p--; break; }
4052 : }
4053 : }
4054 : else
4055 : {
4056 92887 : while (p < end_subject)
4057 : {
4058 82687 : if (*p++ == req_byte) { p--; break; }
4059 : }
4060 : }
4061 :
4062 : /* If we can't find the required character, break the matching loop,
4063 : forcing a match failure. */
4064 :
4065 5917 : if (p >= end_subject)
4066 : {
4067 4284 : rc = MATCH_NOMATCH;
4068 4284 : break;
4069 : }
4070 :
4071 : /* If we have found the required character, save the point where we
4072 : found it, so that we don't search again next time round the loop if
4073 : the start hasn't passed this character yet. */
4074 :
4075 1633 : req_byte_ptr = p;
4076 : }
4077 : }
4078 :
4079 : /* OK, we can now run the match. */
4080 :
4081 17278 : md->start_match = start_match;
4082 17278 : md->match_call_count = 0;
4083 17278 : md->eptrn = 0; /* Next free eptrchain slot */
4084 17278 : rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4085 :
4086 : /* Any return other than MATCH_NOMATCH breaks the loop. */
4087 :
4088 17278 : if (rc != MATCH_NOMATCH) break;
4089 :
4090 : /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4091 : newline in the subject (though it may continue over the newline). Therefore,
4092 : if we have just failed to match, starting at a newline, do not continue. */
4093 :
4094 12253 : if (firstline && IS_NEWLINE(start_match)) break;
4095 :
4096 : /* Advance the match position by one character. */
4097 :
4098 12253 : start_match++;
4099 : #ifdef SUPPORT_UTF8
4100 12253 : if (utf8)
4101 0 : while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4102 0 : start_match++;
4103 : #endif
4104 :
4105 : /* Break the loop if the pattern is anchored or if we have passed the end of
4106 : the subject. */
4107 :
4108 12253 : if (anchored || start_match > end_subject) break;
4109 :
4110 : /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4111 : are now at a LF, advance the match position by one more character. */
4112 :
4113 68 : if (start_match[-1] == '\r' &&
4114 : (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4115 : start_match < end_subject &&
4116 : *start_match == '\n')
4117 0 : start_match++;
4118 :
4119 68 : } /* End of for(;;) "bumpalong" loop */
4120 :
4121 : /* ==========================================================================*/
4122 :
4123 : /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4124 : conditions is true:
4125 :
4126 : (1) The pattern is anchored;
4127 :
4128 : (2) We are past the end of the subject;
4129 :
4130 : (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4131 : this option requests that a match occur at or before the first newline in
4132 : the subject.
4133 :
4134 : When we have a match and the offset vector is big enough to deal with any
4135 : backreferences, captured substring offsets will already be set up. In the case
4136 : where we had to get some local store to hold offsets for backreference
4137 : processing, copy those that we can. In this case there need not be overflow if
4138 : certain parts of the pattern were not used, even though there are more
4139 : capturing parentheses than vector slots. */
4140 :
4141 21494 : if (rc == MATCH_MATCH)
4142 : {
4143 5025 : if (using_temporary_offsets)
4144 : {
4145 0 : if (offsetcount >= 4)
4146 : {
4147 0 : memcpy(offsets + 2, md->offset_vector + 2,
4148 : (offsetcount - 2) * sizeof(int));
4149 : DPRINTF(("Copied offsets from temporary memory\n"));
4150 : }
4151 0 : if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4152 : DPRINTF(("Freeing temporary memory\n"));
4153 0 : (pcre_free)(md->offset_vector);
4154 : }
4155 :
4156 : /* Set the return code to the number of captured strings, or 0 if there are
4157 : too many to fit into the vector. */
4158 :
4159 5025 : rc = md->offset_overflow? 0 : md->end_offset_top/2;
4160 :
4161 : /* If there is space, set up the whole thing as substring 0. */
4162 :
4163 5025 : if (offsetcount < 2) rc = 0; else
4164 : {
4165 5025 : offsets[0] = start_match - md->start_subject;
4166 5025 : offsets[1] = md->end_match_ptr - md->start_subject;
4167 : }
4168 :
4169 : DPRINTF((">>>> returning %d\n", rc));
4170 5025 : return rc;
4171 : }
4172 :
4173 : /* Control gets here if there has been an error, or if the overall match
4174 : attempt has failed at all permitted starting positions. */
4175 :
4176 16469 : if (using_temporary_offsets)
4177 : {
4178 : DPRINTF(("Freeing temporary memory\n"));
4179 0 : (pcre_free)(md->offset_vector);
4180 : }
4181 :
4182 16469 : if (rc != MATCH_NOMATCH)
4183 : {
4184 : DPRINTF((">>>> error: returning %d\n", rc));
4185 0 : return rc;
4186 : }
4187 16469 : else if (md->partial && md->hitend)
4188 : {
4189 : DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4190 0 : return PCRE_ERROR_PARTIAL;
4191 : }
4192 : else
4193 : {
4194 : DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4195 16469 : return PCRE_ERROR_NOMATCH;
4196 : }
4197 : }
4198 :
4199 : /* End of pcre_exec.c */
|