LCOV - PHP Code Coverage - ext/pcre/pcrelib/pcre_compile.c

LTP GCOV extension - code coverage report
Current view:	directory - ext/pcre/pcrelib - pcre_compile.c
Test:	PHP Code Coverage
Date:	2007-04-10	Instrumented lines:	1599
Code covered:	30.6 %	Executed lines:	490
Legend:	not executed executed
       1                 : /*************************************************
       2                 : *      Perl-Compatible Regular Expressions       *
       3                 : *************************************************/
       4                 : 
       5                 : /* PCRE is a library of functions to support regular expressions whose syntax
       6                 : and semantics are as close as possible to those of the Perl 5 language.
       7                 : 
       8                 :                        Written by Philip Hazel
       9                 :            Copyright (c) 1997-2006 University of Cambridge
      10                 : 
      11                 : -----------------------------------------------------------------------------
      12                 : Redistribution and use in source and binary forms, with or without
      13                 : modification, are permitted provided that the following conditions are met:
      14                 : 
      15                 :     * Redistributions of source code must retain the above copyright notice,
      16                 :       this list of conditions and the following disclaimer.
      17                 : 
      18                 :     * Redistributions in binary form must reproduce the above copyright
      19                 :       notice, this list of conditions and the following disclaimer in the
      20                 :       documentation and/or other materials provided with the distribution.
      21                 : 
      22                 :     * Neither the name of the University of Cambridge nor the names of its
      23                 :       contributors may be used to endorse or promote products derived from
      24                 :       this software without specific prior written permission.
      25                 : 
      26                 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
      27                 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      28                 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      29                 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
      30                 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
      31                 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
      32                 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
      33                 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
      34                 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
      35                 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
      36                 : POSSIBILITY OF SUCH DAMAGE.
      37                 : -----------------------------------------------------------------------------
      38                 : */
      39                 : 
      40                 : 
      41                 : /* This module contains the external function pcre_compile(), along with
      42                 : supporting internal functions that are not used by other modules. */
      43                 : 
      44                 : 
      45                 : #define NLBLOCK cd             /* Block containing newline information */
      46                 : #define PSSTART start_pattern  /* Field containing processed string start */
      47                 : #define PSEND   end_pattern    /* Field containing processed string end */
      48                 : 
      49                 : 
      50                 : #include "pcre_internal.h"
      51                 : 
      52                 : 
      53                 : /* When DEBUG is defined, we need the pcre_printint() function, which is also
      54                 : used by pcretest. DEBUG is not defined when building a production library. */
      55                 : 
      56                 : #ifdef DEBUG
      57                 : #include "pcre_printint.src"
      58                 : #endif
      59                 : 
      60                 : 
      61                 : /*************************************************
      62                 : *      Code parameters and static tables         *
      63                 : *************************************************/
      64                 : 
      65                 : /* This value specifies the size of stack workspace that is used during the
      66                 : first pre-compile phase that determines how much memory is required. The regex
      67                 : is partly compiled into this space, but the compiled parts are discarded as
      68                 : soon as they can be, so that hopefully there will never be an overrun. The code
      69                 : does, however, check for an overrun. The largest amount I've seen used is 218,
      70                 : so this number is very generous.
      71                 : 
      72                 : The same workspace is used during the second, actual compile phase for
      73                 : remembering forward references to groups so that they can be filled in at the
      74                 : end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
      75                 : is 4 there is plenty of room. */
      76                 : 
      77                 : #define COMPILE_WORK_SIZE (4096)
      78                 : 
      79                 : 
      80                 : /* Table for handling escaped characters in the range '0'-'z'. Positive returns
      81                 : are simple data values; negative values are for special things like \d and so
      82                 : on. Zero means further processing is needed (for things like \x), or the escape
      83                 : is invalid. */
      84                 : 
      85                 : #if !EBCDIC   /* This is the "normal" table for ASCII systems */
      86                 : static const short int escapes[] = {      /* indexed by (c - '0') in check_escape() */
      87                 :      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
      88                 :      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
      89                 :    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
      90                 :      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
      91                 : -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
      92                 : -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
      93                 :    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
      94                 :      0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
      95                 : -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
      96                 :      0,      0, -ESC_z                                            /* x - z */
      97                 : };
      98                 : 
      99                 : #else         /* This is the "abnormal" table for EBCDIC systems */
     100                 : static const short int escapes[] = {      /* indexed by (c - 0x48) in check_escape(); row labels are hex code points */
     101                 : /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
     102                 : /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
     103                 : /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
     104                 : /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
     105                 : /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
     106                 : /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
     107                 : /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
     108                 : /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
     109                 : /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
     110                 : /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
     111                 : /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
     112                 : /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
     113                 : /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
     114                 : /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
     115                 : /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
     116                 : /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
     117                 : /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
     118                 : /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
     119                 : /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
     120                 : /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
     121                 : /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
     122                 : /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
     123                 : /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
     124                 : };
     125                 : #endif
     126                 : 
     127                 : 
     128                 : /* Tables of names of POSIX character classes and their lengths. The list is
     129                 : terminated by a zero length entry. The first three must be alpha, lower, upper,
     130                 : as this is assumed for handling case independence. */
     131                 : 
     132                 : static const char *const posix_names[] = {   /* same order as posix_name_lengths[] and posix_class_maps[] */
     133                 :   "alpha", "lower", "upper",                  /* these three must stay first (case-independence handling) */
     134                 :   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
     135                 :   "print", "punct", "space", "word",  "xdigit" };
     136                 : 
     137                 : static const uschar posix_name_lengths[] = {   /* length of each posix_names[] entry, in the same order */
     138                 :   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* the final 0 terminates the list */
     139                 : 
     140                 : /* Table of class bit maps for each POSIX class. Each class is formed from a
     141                 : base map, with an optional addition or removal of another map. Then, for some
     142                 : classes, there is some additional tweaking: for [:blank:] the vertical space
     143                 : characters are removed, and for [:alpha:] and [:alnum:] the underscore
     144                 : character is removed. The triples in the table consist of the base map offset,
     145                 : second map offset or -1 if no second map, and a non-negative value for map
     146                 : addition or a negative value for map subtraction (if there are two maps). The
     147                 : absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
     148                 : remove vertical space characters, 2 => remove underscore. */
     149                 : 
     150                 : static const int posix_class_maps[] = {   /* three ints per class, rows in posix_names[] order */
     151                 :   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
     152                 :   cbit_lower, -1,          0,             /* lower */
     153                 :   cbit_upper, -1,          0,             /* upper */
     154                 :   cbit_word,  -1,          2,             /* alnum - word without underscore */
     155                 :   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
     156                 :   cbit_space, -1,          1,             /* blank - a GNU extension */
     157                 :   cbit_cntrl, -1,          0,             /* cntrl */
     158                 :   cbit_digit, -1,          0,             /* digit */
     159                 :   cbit_graph, -1,          0,             /* graph */
     160                 :   cbit_print, -1,          0,             /* print */
     161                 :   cbit_punct, -1,          0,             /* punct */
     162                 :   cbit_space, -1,          0,             /* space */
     163                 :   cbit_word,  -1,          0,             /* word - a Perl extension */
     164                 :   cbit_xdigit,-1,          0              /* xdigit */
     165                 : };
     166                 : 
     167                 : 
     168                 : #define STRING(a)  # a            /* stringize the argument exactly as written */
     169                 : #define XSTRING(s) STRING(s)      /* macro-expand s first, then stringize the result */
     170                 : 
     171                 : /* The texts of compile-time error messages. These are "char *" because they
     172                 : are passed to the outside world. Do not ever re-use any error number, because
     173                 : they are documented. Always add a new error instead. Messages marked DEAD below
     174                 : are no longer used. */
     175                 : 
     176                 : static const char *error_texts[] = {   /* position in this list is the error number; markers below count in fives */
     177                 :   "no error",
     178                 :   "\\ at end of pattern",
     179                 :   "\\c at end of pattern",
     180                 :   "unrecognized character follows \\",
     181                 :   "numbers out of order in {} quantifier",
     182                 :   /* 5 */
     183                 :   "number too big in {} quantifier",
     184                 :   "missing terminating ] for character class",
     185                 :   "invalid escape sequence in character class",
     186                 :   "range out of order in character class",
     187                 :   "nothing to repeat",
     188                 :   /* 10 */
     189                 :   "operand of unlimited repeat could match the empty string",  /** DEAD **/
     190                 :   "internal error: unexpected repeat",
     191                 :   "unrecognized character after (?",
     192                 :   "POSIX named classes are supported only within a class",
     193                 :   "missing )",
     194                 :   /* 15 */
     195                 :   "reference to non-existent subpattern",
     196                 :   "erroffset passed as NULL",
     197                 :   "unknown option bit(s) set",
     198                 :   "missing ) after comment",
     199                 :   "parentheses nested too deeply",  /** DEAD **/
     200                 :   /* 20 */
     201                 :   "regular expression too large",
     202                 :   "failed to get memory",
     203                 :   "unmatched parentheses",
     204                 :   "internal error: code overflow",
     205                 :   "unrecognized character after (?<",
     206                 :   /* 25 */
     207                 :   "lookbehind assertion is not fixed length",
     208                 :   "malformed number or name after (?(",
     209                 :   "conditional group contains more than two branches",
     210                 :   "assertion expected after (?(",
     211                 :   "(?R or (?digits must be followed by )",
     212                 :   /* 30 */
     213                 :   "unknown POSIX class name",
     214                 :   "POSIX collating elements are not supported",
     215                 :   "this version of PCRE is not compiled with PCRE_UTF8 support",
     216                 :   "spare error",  /** DEAD **/
     217                 :   "character value in \\x{...} sequence is too large",
     218                 :   /* 35 */
     219                 :   "invalid condition (?(0)",
     220                 :   "\\C not allowed in lookbehind assertion",
     221                 :   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
     222                 :   "number after (?C is > 255",
     223                 :   "closing ) for (?C expected",
     224                 :   /* 40 */
     225                 :   "recursive call could loop indefinitely",
     226                 :   "unrecognized character after (?P",
     227                 :   "syntax error in subpattern name (missing terminator)",
     228                 :   "two named subpatterns have the same name",
     229                 :   "invalid UTF-8 string",
     230                 :   /* 45 */
     231                 :   "support for \\P, \\p, and \\X has not been compiled",
     232                 :   "malformed \\P or \\p sequence",
     233                 :   "unknown property name after \\P or \\p",
     234                 :   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
     235                 :   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
     236                 :   /* 50 */
     237                 :   "repeated subpattern is too long",
     238                 :   "octal value is greater than \\377 (not in UTF-8 mode)",
     239                 :   "internal error: overran compiling workspace",
     240                 :   "internal error: previously-checked referenced subpattern not found",
     241                 :   "DEFINE group contains more than one branch",
     242                 :   /* 55 */
     243                 :   "repeating a DEFINE group is not allowed",
     244                 :   "inconsistent NEWLINE options",
     245                 :   "\\g is not followed by an (optionally braced) non-zero number"
     246                 : };
     247                 : 
     248                 : 
     249                 : /* Table to identify digits and hex digits. This is used when compiling
     250                 : patterns. Note that the tables in chartables are dependent on the locale, and
     251                 : may mark arbitrary characters as digits - but the PCRE compiling code expects
     252                 : to handle only 0-9, a-f, and A-F as (hex) digits when compiling. That is why we have
     253                 : a private table here. It costs 256 bytes, but it is a lot faster than doing
     254                 : character value tests (at least in some simple cases I timed), and in some
     255                 : applications one wants PCRE to compile efficiently as well as match
     256                 : efficiently.
     257                 : 
     258                 : For convenience, we use the same bit definitions as in chartables:
     259                 : 
     260                 :   0x04   decimal digit
     261                 :   0x08   hexadecimal digit
     262                 : 
     263                 : Then we can use ctype_digit and ctype_xdigit in the code. */
     264                 : 
     265                 : #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
     266                 : static const unsigned char digitab[] =   /* indexed by byte value: 0x0c = decimal and hex digit, 0x08 = hex digit only */
     267                 :   {
     268                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
     269                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
     270                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
     271                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
     272                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
     273                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
     274                 :   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
     275                 :   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
     276                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
     277                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
     278                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
     279                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
     280                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
     281                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
     282                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
     283                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
     284                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
     285                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
     286                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
     287                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
     288                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
     289                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
     290                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
     291                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
     292                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
     293                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
     294                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
     295                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
     296                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
     297                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
     298                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
     299                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
     300                 : 
     301                 : #else          /* This is the "abnormal" case, for EBCDIC systems */
     302                 : static const unsigned char digitab[] =   /* indexed by byte value: 0x0c = decimal and hex digit, 0x08 = hex digit only */
     303                 :   {
     304                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
     305                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
     306                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
     307                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
     308                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
     309                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
     310                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
     311                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
     312                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
     313                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
     314                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
     315                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
     316                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
     317                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
     318                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
     319                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
     320                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
     321                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
     322                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
     323                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
     324                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
     325                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
     326                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
     327                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
     328                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
     329                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
     330                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
     331                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
     332                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
     333                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
     334                 :   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
     335                 :   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
     336                 : 
     337                 : static const unsigned char ebcdic_chartab[] = { /* chartable partial dup; indexed by byte value, check_escape() tests bits 0x0E to spot alphamerics */
     338                 :   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
     339                 :   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
     340                 :   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
     341                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
     342                 :   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
     343                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
     344                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
     345                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
     346                 :   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
     347                 :   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
     348                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
     349                 :   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
     350                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
     351                 :   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
     352                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
     353                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
     354                 :   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
     355                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
     356                 :   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
     357                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
     358                 :   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
     359                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
     360                 :   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
     361                 :   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
     362                 :   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
     363                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
     364                 :   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
     365                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
     366                 :   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
     367                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
     368                 :   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
     369                 :   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
     370                 : #endif
     371                 : 
     372                 : 
     373                 : /* Definition to allow mutual recursion */
     374                 : 
     375                 : static BOOL                       /* prototype only: declares compile_regex() ahead of its use */
     376                 :   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
     377                 :     int *, branch_chain *, compile_data *, int *);   /* parameters are unnamed in this declaration */
     378                 : 
     379                 : 
     380                 : 
     381                 : /*************************************************
     382                 : *            Handle escapes                      *
     383                 : *************************************************/
     384                 : 
     385                 : /* This function is called when a \ has been encountered. It either returns a
     386                 : positive value for a simple escape such as \n, or a negative value which
     387                 : encodes one of the more complicated things such as \d. A backreference to group
     388                 : n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
     389                 : UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
     390                 : ptr is pointing at the \. On exit, it is on the final character of the escape
     391                 : sequence.
     392                 : 
     393                 : Arguments:
     394                 :   ptrptr         points to the pattern position pointer
     395                 :   errorcodeptr   points to the errorcode variable
     396                 :   bracount       number of previous extracting brackets
     397                 :   options        the options bits
     398                 :   isclass        TRUE if inside a character class
     399                 : 
     400                 : Returns:         zero or positive => a data character
     401                 :                  negative => a special escape sequence
     402                 :                  on error, *errorcodeptr is set
     403                 : */
     404                 : 
     405                 : static int
     406                 : check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
     407                 :   int options, BOOL isclass)
     408            7656 : {
     409            7656 : BOOL utf8 = (options & PCRE_UTF8) != 0;
     410            7656 : const uschar *ptr = *ptrptr + 1;
     411                 : int c, i;
     412                 : 
     413            7656 : GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
     414            7656 : ptr--;                            /* Set pointer back to the last byte */
     415                 : 
     416                 : /* If backslash is at the end of the pattern, it's an error: ERR1 is recorded
                         : and the zero in c is what is eventually returned. */
     417                 : 
     418            7656 : if (c == 0) *errorcodeptr = ERR1;
     419                 : 
     420                 : /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
     421                 : a table. A non-zero result is something that can be returned immediately.
     422                 : Otherwise further processing may be required. The final else below is reached
                         : only when the lookup has stored zero in i, and the \0 and \x digit loops
                         : further down rely on that zero as the start of their "i++ < 2" counts. */
     423                 : 
     424                 : #if !EBCDIC    /* ASCII coding */
     425            7656 : else if (c < '0' || c > 'z') {}                           /* Not alphameric */
     426            3990 : else if ((i = escapes[c - '0']) != 0) c = i;
     427                 : 
     428                 : #else          /* EBCDIC coding */
     429                 : else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
     430                 : else if ((i = escapes[c - 0x48]) != 0)  c = i;
     431                 : #endif
     432                 : 
     433                 : /* Escapes that need further processing, or are illegal. */
     434                 : 
     435                 : else
     436                 :   {
     437                 :   const uschar *oldptr;
     438                 :   BOOL braced, negated;
     439                 : 
     440               0 :   switch (c)
     441                 :     {
     442                 :     /* A number of Perl escapes are not handled by PCRE. We give an explicit
     443                 :     error. */
     444                 : 
     445                 :     case 'l':
     446                 :     case 'L':
     447                 :     case 'N':
     448                 :     case 'u':
     449                 :     case 'U':
     450               0 :     *errorcodeptr = ERR37;
     451               0 :     break;
     452                 : 
     453                 :     /* \g must be followed by a number, either plain or braced. If positive, it
     454                 :     is an absolute backreference. If negative, it is a relative backreference.
     455                 :     This is a Perl 5.10 feature. A relative reference is counted back from
                         :     bracount, so that -1 denotes group number bracount; ERR15 is given if it
                         :     reaches back further than bracount. A missing or zero number, or a missing
                         :     closing brace, gives ERR57. Both error paths return 0 without updating
                         :     *ptrptr. NOTE(review): the digits are accumulated into an int with no
                         :     upper bound, so a very long digit string can overflow c. */
     456                 : 
     457                 :     case 'g':
     458               0 :     if (ptr[1] == '{')
     459                 :       {
     460               0 :       braced = TRUE;
     461               0 :       ptr++;
     462                 :       }
     463               0 :     else braced = FALSE;
     464                 : 
     465               0 :     if (ptr[1] == '-')
     466                 :       {
     467               0 :       negated = TRUE;
     468               0 :       ptr++;
     469                 :       }
     470               0 :     else negated = FALSE;
     471                 : 
     472               0 :     c = 0;
     473               0 :     while ((digitab[ptr[1]] & ctype_digit) != 0)
     474               0 :       c = c * 10 + *(++ptr) - '0';
     475                 : 
     476               0 :     if (c == 0 || (braced && *(++ptr) != '}'))
     477                 :       {
     478               0 :       *errorcodeptr = ERR57;
     479               0 :       return 0;
     480                 :       }
     481                 : 
     482               0 :     if (negated)
     483                 :       {
     484               0 :       if (c > bracount)
     485                 :         {
     486               0 :         *errorcodeptr = ERR15;
     487               0 :         return 0;
     488                 :         }
     489               0 :       c = bracount - (c - 1);
     490                 :       }
     491                 : 
     492               0 :     c = -(ESC_REF + c);
     493               0 :     break;
     494                 : 
     495                 :     /* The handling of escape sequences consisting of a string of digits
     496                 :     starting with one that is not zero is not straightforward. By experiment,
     497                 :     the way Perl works seems to be as follows:
     498                 : 
     499                 :     Outside a character class, the digits are read as a decimal number. If the
     500                 :     number is less than 10, or if there are that many previous extracting
     501                 :     left brackets, then it is a back reference. Otherwise, up to three octal
     502                 :     digits are read to form an escaped byte. Thus \123 is likely to be octal
     503                 :     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
     504                 :     value is greater than 377, the least significant 8 bits are taken. Inside a
     505                 :     character class, \ followed by a digit is always an octal number.
                         : 
                         :     Note that the code below departs from the truncation described above: an
                         :     octal value greater than 255 gives ERR51 unless UTF-8 mode is set. As with
                         :     \g, the decimal number is accumulated with no upper bound (NOTE(review):
                         :     a very long digit string can overflow c). */
     506                 : 
     507                 :     case '1': case '2': case '3': case '4': case '5':
     508                 :     case '6': case '7': case '8': case '9':
     509                 : 
     510               0 :     if (!isclass)
     511                 :       {
     512               0 :       oldptr = ptr;
     513               0 :       c -= '0';
     514               0 :       while ((digitab[ptr[1]] & ctype_digit) != 0)
     515               0 :         c = c * 10 + *(++ptr) - '0';
     516               0 :       if (c < 10 || c <= bracount)
     517                 :         {
     518               0 :         c = -(ESC_REF + c);
     519               0 :         break;
     520                 :         }
     521               0 :       ptr = oldptr;      /* Put the pointer back and fall through */
     522                 :       }
     523                 : 
     524                 :     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
     525                 :     generates a binary zero byte and treats the digit as a following literal.
     526                 :     Thus we have to pull back the pointer by one. */
     527                 : 
     528               0 :     if ((c = *ptr) >= '8')
     529                 :       {
     530               0 :       ptr--;
     531               0 :       c = 0;
     532               0 :       break;
     533                 :       }
     534                 : 
     535                 :     /* \0 always starts an octal number, but we may drop through to here with a
     536                 :     larger first octal digit. The original code used just to take the least
     537                 :     significant 8 bits of octal numbers (I think this is what early Perls used
     538                 :     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
     539                 :     than 3 octal digits. The first digit is already in c; i is still zero here,
                         :     so the loop takes at most two more. */
     540                 : 
     541                 :     case '0':
     542               0 :     c -= '0';
     543               0 :     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
     544               0 :         c = c * 8 + *(++ptr) - '0';
     545               0 :     if (!utf8 && c > 255) *errorcodeptr = ERR51;
     546               0 :     break;
     547                 : 
     548                 :     /* \x is complicated. \x{ddd} is a character number which can be greater
     549                 :     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
     550                 :     treated as a data character. Leading zeros are skipped and not counted;
                         :     ERR34 is given if the value has gone negative or if there are more than
                         :     8 significant digits in UTF-8 mode (more than 2 otherwise). */
     551                 : 
     552                 :     case 'x':
     553               0 :     if (ptr[1] == '{')
     554                 :       {
     555               0 :       const uschar *pt = ptr + 2;
     556               0 :       int count = 0;
     557                 : 
     558               0 :       c = 0;
     559               0 :       while ((digitab[*pt] & ctype_xdigit) != 0)
     560                 :         {
     561               0 :         register int cc = *pt++;
     562               0 :         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
     563               0 :         count++;
     564                 : 
     565                 : #if !EBCDIC    /* ASCII coding */
     566               0 :         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
     567               0 :         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
     568                 : #else          /* EBCDIC coding */
     569                 :         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
     570                 :         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
     571                 : #endif
     572                 :         }
     573                 : 
     574               0 :       if (*pt == '}')
     575                 :         {
     576               0 :         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
     577               0 :         ptr = pt;
     578               0 :         break;
     579                 :         }
     580                 : 
     581                 :       /* If the sequence of hex digits does not end with '}', then we don't
     582                 :       recognize this construct; fall through to the normal \x handling. */
     583                 :       }
     584                 : 
     585                 :     /* Read just a single-byte hex-defined char: at most two hex digits, counted
                         :     in i, which is still zero on arrival here */
     586                 : 
     587               0 :     c = 0;
     588               0 :     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
     589                 :       {
     590                 :       int cc;                               /* Some compilers don't like ++ */
     591               0 :       cc = *(++ptr);                        /* in initializers */
     592                 : #if !EBCDIC    /* ASCII coding */
     593               0 :       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
     594               0 :       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
     595                 : #else          /* EBCDIC coding */
     596                 :       if (cc <= 'z') cc += 64;              /* Convert to upper case */
     597                 :       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
     598                 : #endif
     599                 :       }
     600               0 :     break;
     601                 : 
     602                 :     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
     603                 :     This coding is ASCII-specific, but then the whole concept of \cx is
     604                 :     ASCII-specific. (However, an EBCDIC equivalent has now been added.) If the
                         :     pattern ends straight after the c, ERR2 is given and 0 is returned without
                         :     updating *ptrptr. */
     605                 : 
     606                 :     case 'c':
     607               0 :     c = *(++ptr);
     608               0 :     if (c == 0)
     609                 :       {
     610               0 :       *errorcodeptr = ERR2;
     611               0 :       return 0;
     612                 :       }
     613                 : 
     614                 : #if !EBCDIC    /* ASCII coding */
     615               0 :     if (c >= 'a' && c <= 'z') c -= 32;
     616               0 :     c ^= 0x40;
     617                 : #else          /* EBCDIC coding */
     618                 :     if (c >= 'a' && c <= 'z') c += 64;
     619                 :     c ^= 0xC0;
     620                 : #endif
     621               0 :     break;
     622                 : 
     623                 :     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
     624                 :     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
     625                 :     for Perl compatibility, it is a literal. This code looks a bit odd, but
     626                 :     there used to be some cases other than the default, and there may be again
     627                 :     in future, so I haven't "optimized" it. */
     628                 : 
     629                 :     default:
     630               0 :     if ((options & PCRE_EXTRA) != 0) switch(c)
     631                 :       {
     632                 :       default:
     633               0 :       *errorcodeptr = ERR3;
     634                 :       break;
     635                 :       }
     636                 :     break;
     637                 :     }
     638                 :   }
     639                 : 
     640            7656 : *ptrptr = ptr;
     641            7656 : return c;
     642                 : }
     643                 : 
     644                 : 
     645                 : 
     646                 : #ifdef SUPPORT_UCP
     647                 : /*************************************************
     648                 : *               Handle \P and \p                 *
     649                 : *************************************************/
     650                 : 
     651                 : /* This function is called after \P or \p has been encountered, provided that
     652                 : PCRE is compiled with support for Unicode properties. On entry, ptrptr is
     653                 : pointing at the P or p. On exit, it is pointing at the final character of the
     654                 : escape sequence, or, after an ERR46 failure, at the character where the
                         : problem was detected.
     655                 : 
     656                 : Arguments:
     657                 :   ptrptr         points to the pattern position pointer
     658                 :   negptr         points to a boolean that is set TRUE for negation else FALSE
                         :                    (not written if the pattern ends straight after P or p)
     659                 :   dptr           points to an int that is set to the detailed property value
                         :                    (written only when the name is recognized)
     660                 :   errorcodeptr   points to the error code variable
     661                 : 
     662                 : Returns:         the type field of the matching _pcre_utt entry, or -1 with
                         :                  *errorcodeptr set: ERR46 if the sequence is malformed (the
                         :                  pattern ends, or no closing } is found within the first
                         :                  sizeof(name)-1 characters of a braced name), ERR47 if the
                         :                  name is not found in _pcre_utt
     663                 : */
     664                 : 
     665                 : static int
     666                 : get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
     667               0 : {
     668                 : int c, i, bot, top;
     669               0 : const uschar *ptr = *ptrptr;
     670                 : char name[32];
     671                 : 
     672               0 : c = *(++ptr);
     673               0 : if (c == 0) goto ERROR_RETURN;
     674                 : 
     675               0 : *negptr = FALSE;
     676                 : 
     677                 : /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
     678                 : negation. The name is copied into the local buffer; the loop bound leaves room
                         : for the terminating zero, so a name too long for the buffer falls out of the
                         : loop without having seen } and is rejected just below. */
     679                 : 
     680               0 : if (c == '{')
     681                 :   {
     682               0 :   if (ptr[1] == '^')
     683                 :     {
     684               0 :     *negptr = TRUE;
     685               0 :     ptr++;
     686                 :     }
     687               0 :   for (i = 0; i < sizeof(name) - 1; i++)
     688                 :     {
     689               0 :     c = *(++ptr);
     690               0 :     if (c == 0) goto ERROR_RETURN;
     691               0 :     if (c == '}') break;
     692               0 :     name[i] = c;
     693                 :     }
     694               0 :   if (c !='}') goto ERROR_RETURN;
     695               0 :   name[i] = 0;
     696                 :   }
     697                 : 
     698                 : /* Otherwise there is just one following character */
     699                 : 
     700                 : else
     701                 :   {
     702               0 :   name[0] = c;
     703               0 :   name[1] = 0;
     704                 :   }
     705                 : 
     706               0 : *ptrptr = ptr;
     707                 : 
     708                 : /* Search for a recognized property name using binary chop. NOTE(review): this
                         : presumes that _pcre_utt, which is defined elsewhere, is sorted by name in
                         : strcmp() order - confirm against the table. */
     709                 : 
     710               0 : bot = 0;
     711               0 : top = _pcre_utt_size;
     712                 : 
     713               0 : while (bot < top)
     714                 :   {
     715               0 :   i = (bot + top) >> 1;
     716               0 :   c = strcmp(name, _pcre_utt[i].name);
     717               0 :   if (c == 0)
     718                 :     {
     719               0 :     *dptr = _pcre_utt[i].value;
     720               0 :     return _pcre_utt[i].type;
     721                 :     }
     722               0 :   if (c > 0) bot = i + 1; else top = i;
     723                 :   }
     724                 : 
     725               0 : *errorcodeptr = ERR47;
     726               0 : *ptrptr = ptr;
     727               0 : return -1;
     728                 : 
     729               0 : ERROR_RETURN:
     730               0 : *errorcodeptr = ERR46;
     731               0 : *ptrptr = ptr;
     732               0 : return -1;
     733                 : }
     734                 : #endif
     735                 : 
     736                 : 
     737                 : 
     738                 : 
     739                 : /*************************************************
     740                 : *            Check for counted repeat            *
     741                 : *************************************************/
     742                 : 
     743                 : /* This function is called when a '{' is encountered in a place where it might
     744                 : start a quantifier. It looks ahead to see if it really is a quantifier or not.
     745                 : It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
     746                 : where the ddds are digits. At least one digit is required before any comma, so
                         : {,ddd} is not accepted. Only the syntax is examined here; the numeric values
                         : are not range-checked.
     747                 : 
     748                 : Arguments:
     749                 :   p         pointer to the first char after '{'
     750                 : 
     751                 : Returns:    TRUE if the text at p has one of those forms, otherwise FALSE
     752                 : */
     753                 : 
     754                 : static BOOL
     755                 : is_counted_repeat(const uschar *p)
     756               0 : {
     757               0 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
     758               0 : while ((digitab[*p] & ctype_digit) != 0) p++;
     759               0 : if (*p == '}') return TRUE;
     760                 : 
     761               0 : if (*p++ != ',') return FALSE;
     762               0 : if (*p == '}') return TRUE;
     763                 : 
     764               0 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
     765               0 : while ((digitab[*p] & ctype_digit) != 0) p++;
     766                 : 
     767               0 : return (*p == '}');
     768                 : }
     769                 : 
     770                 : 
     771                 : 
     772                 : /*************************************************
     773                 : *         Read repeat counts                     *
     774                 : *************************************************/
     775                 : 
     776                 : /* Read an item of the form {n,m} and return the values. This is called only
     777                 : after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
     778                 : so the syntax is guaranteed to be correct, but we need to check the values.
     779                 : 
     780                 : Arguments:
     781                 :   p              pointer to first char after '{'
     782                 :   minp           pointer to int for min
     783                 :   maxp           pointer to int for max
     784                 :                  returned as -1 if no max
     785                 :   errorcodeptr   points to error code variable
     786                 : 
     787                 : Returns:         pointer to '}' on success;
     788                 :                  current ptr on error, with errorcodeptr set non-zero
                         :                  (ERR5 for a value outside 0..65535, ERR4 if max is less
                         :                  than min); *minp and *maxp are not written on error
     789                 : */
     790                 : 
     791                 : static const uschar *
     792                 : read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
     793               0 : {
     794               0 : int min = 0;
     795               0 : int max = -1;
     796                 : 
     797                 : /* Read the minimum value and do a paranoid check: a negative value indicates
     798                 : an integer overflow. NOTE(review): the check is made only after every digit has
                         : been accumulated. Signed overflow is undefined behaviour in C, and a long
                         : enough digit string could wrap back into the accepted range instead of going
                         : negative. */
     799                 : 
     800               0 : while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
     801               0 : if (min < 0 || min > 65535)
     802                 :   {
     803               0 :   *errorcodeptr = ERR5;
     804               0 :   return p;
     805                 :   }
     806                 : 
     807                 : /* Read the maximum value if there is one, and again do a paranoid check on its
     808                 : size. Also, max must not be less than min. A '}' directly after the minimum
                         : means max equals min. Otherwise the ++p steps over the comma that
                         : is_counted_repeat() has already verified, and a '}' straight after the comma
                         : leaves max at -1, meaning no upper limit. The same overflow caveat applies to
                         : max as to min. */
     809                 : 
     810               0 : if (*p == '}') max = min; else
     811                 :   {
     812               0 :   if (*(++p) != '}')
     813                 :     {
     814               0 :     max = 0;
     815               0 :     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
     816               0 :     if (max < 0 || max > 65535)
     817                 :       {
     818               0 :       *errorcodeptr = ERR5;
     819               0 :       return p;
     820                 :       }
     821               0 :     if (max < min)
     822                 :       {
     823               0 :       *errorcodeptr = ERR4;
     824               0 :       return p;
     825                 :       }
     826                 :     }
     827                 :   }
     828                 : 
     829                 : /* Fill in the required variables, and pass back the pointer to the terminating
     830                 : '}'. */
     831                 : 
     832               0 : *minp = min;
     833               0 : *maxp = max;
     834               0 : return p;
     835                 : }
     836                 : 
     837                 : 
     838                 : 
     839                 : /*************************************************
     840                 : *       Find forward referenced subpattern       *
     841                 : *************************************************/
     842                 : 
     843                 : /* This function scans along a pattern's text looking for capturing
     844                 : subpatterns, and counting them. If it finds a named pattern that matches the
     845                 : name it is given, it returns its number. Alternatively, if the name is NULL, it
     846                 : returns when it reaches a given numbered subpattern. This is used for forward
     847                 : references to subpatterns. We know that if (?P< is encountered, the name will
     848                 : be terminated by '>' because that is checked in the first pass. The P is
                         : optional, and a name may instead be opened with an apostrophe, in which case
                         : the terminator sought is another apostrophe. NOTE(review): the scan for the
                         : terminator has no end-of-pattern test of its own; it depends entirely on that
                         : first-pass check.
     849                 : 
     850                 : Arguments:
     851                 :   ptr          current position in the pattern
     852                 :   count        current count of capturing parens so far encountered
     853                 :   name         name to seek, or NULL if seeking a numbered subpattern
     854                 :   lorn         name length, or subpattern number if name is NULL
     855                 :   xmode        TRUE if we are in /x mode
     856                 : 
     857                 : Returns:       the number of the subpattern that was sought (by name, or by
                         :                number when name is NULL), or -1 if it is not found
     858                 : */
     859                 : 
     860                 : static int
     861                 : find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
     862                 :   BOOL xmode)
     863               0 : {
     864                 : const uschar *thisname;
     865                 : 
     866               0 : for (; *ptr != 0; ptr++)
     867                 :   {
     868                 :   int term;
     869                 : 
     870                 :   /* Skip over backslashed characters and also entire \Q...\E. Running into
                         :   the end of the pattern while doing so gives -1. */
     871                 : 
     872               0 :   if (*ptr == '\\')
     873                 :     {
     874               0 :     if (*(++ptr) == 0) return -1;
     875               0 :     if (*ptr == 'Q') for (;;)
     876                 :       {
     877               0 :       while (*(++ptr) != 0 && *ptr != '\\');
     878               0 :       if (*ptr == 0) return -1;
     879               0 :       if (*(++ptr) == 'E') break;
     880               0 :       }
     881               0 :     continue;
     882                 :     }
     883                 : 
     884                 :   /* Skip over character classes. NOTE(review): the scan for ']' has no test of
                         :   its own for the end of the pattern (only the backslash handling inside it
                         :   checks for zero), and a ']' immediately after the '[' is taken as the end
                         :   of the class. */
     885                 : 
     886               0 :   if (*ptr == '[')
     887                 :     {
     888               0 :     while (*(++ptr) != ']')
     889                 :       {
     890               0 :       if (*ptr == '\\')
     891                 :         {
     892               0 :         if (*(++ptr) == 0) return -1;
     893               0 :         if (*ptr == 'Q') for (;;)
     894                 :           {
     895               0 :           while (*(++ptr) != 0 && *ptr != '\\');
     896               0 :           if (*ptr == 0) return -1;
     897               0 :           if (*(++ptr) == 'E') break;
     898               0 :           }
     899               0 :         continue;
     900                 :         }
     901                 :       }
     902               0 :     continue;
     903                 :     }
     904                 : 
     905                 :   /* Skip comments in /x mode. A comment that runs to the end of the pattern
                         :   means there is nothing further to find, so -1 is returned. */
     906                 : 
     907               0 :   if (xmode && *ptr == '#')
     908                 :     {
     909               0 :     while (*(++ptr) != 0 && *ptr != '\n');
     910               0 :     if (*ptr == 0) return -1;
     911               0 :     continue;
     912                 :     }
     913                 : 
     914                 :   /* An opening parens must now be a real metacharacter. One that is not
                         :   followed by '?' is a plain capturing group and is counted at once. */
     915                 : 
     916               0 :   if (*ptr != '(') continue;
     917               0 :   if (ptr[1] != '?')
     918                 :     {
     919               0 :     count++;
     920               0 :     if (name == NULL && count == lorn) return count;
     921               0 :     continue;
     922                 :     }
     923                 : 
     924               0 :   ptr += 2;
     925               0 :   if (*ptr == 'P') ptr++;                      /* Allow optional P */
     926                 : 
     927                 :   /* We have to disambiguate (?<! and (?<= from (?<name>. Anything that is not
                         :   a named group is passed over without being counted. */
     928                 : 
     929               0 :   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
     930                 :        *ptr != '\'')
     931               0 :     continue;
     932                 : 
     933               0 :   count++;
     934                 : 
     935               0 :   if (name == NULL && count == lorn) return count;
     936               0 :   term = *ptr++;
     937               0 :   if (term == '<') term = '>';
     938               0 :   thisname = ptr;
     939               0 :   while (*ptr != term) ptr++;
     940               0 :   if (name != NULL && lorn == ptr - thisname &&
     941                 :       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
     942               0 :     return count;
     943                 :   }
     944                 : 
     945               0 : return -1;
     946                 : }
     947                 : 
     948                 : 
     949                 : 
     950                 : /*************************************************
     951                 : *      Find first significant op code            *
     952                 : *************************************************/
     953                 : 
     954                 : /* This is called by several functions that scan a compiled expression looking
     955                 : for a fixed first character, or an anchoring op code etc. It skips over things
     956                 : that do not influence this. For some calls, a change of option is important.
     957                 : For some calls, it makes sense to skip negative forward and all backward
     958                 : assertions, and also the \b assertion; for others it does not. OP_OPT,
                         : OP_CALLOUT, OP_CREF, OP_RREF and OP_DEF items are always skipped.
     959                 : 
     960                 : Arguments:
     961                 :   code         pointer to the start of the group
     962                 :   options      pointer to external options; overwritten with the value held
                         :                  in an OP_OPT item when optbit is positive and that bit
                         :                  differs between the two
     963                 :   optbit       the option bit whose changing is significant, or
     964                 :                  zero if none are
     965                 :   skipassert   TRUE if certain assertions are to be skipped
     966                 : 
     967                 : Returns:       pointer to the first significant opcode
     968                 : */
     969                 : 
     970                 : static const uschar*
     971                 : first_significant_code(const uschar *code, int *options, int optbit,
     972                 :   BOOL skipassert)
     973             186 : {
     974                 : for (;;)
     975                 :   {
     976             186 :   switch ((int)*code)
     977                 :     {
     978                 :     case OP_OPT:
     979               0 :     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
     980               0 :       *options = (int)code[1];
     981               0 :     code += 2;
     982               0 :     break;
     983                 : 
     984                 :     case OP_ASSERT_NOT:
     985                 :     case OP_ASSERTBACK:
     986                 :     case OP_ASSERTBACK_NOT:
     987               0 :     if (!skipassert) return code;
     988               0 :     do code += GET(code, 1); while (*code == OP_ALT);
     989               0 :     code += _pcre_OP_lengths[*code];
     990               0 :     break;
     991                 : 
     992                 :     case OP_WORD_BOUNDARY:
     993                 :     case OP_NOT_WORD_BOUNDARY:
     994               0 :     if (!skipassert) return code;
     995                 :     /* Fall through: a skipped \b or \B is stepped over by its table length,
                         :     just like the items below */
     996                 : 
     997                 :     case OP_CALLOUT:
     998                 :     case OP_CREF:
     999                 :     case OP_RREF:
    1000                 :     case OP_DEF:
    1001               0 :     code += _pcre_OP_lengths[*code];
    1002               0 :     break;
    1003                 : 
    1004                 :     default:
    1005             186 :     return code;
    1006                 :     }
    1007               0 :   }
    1008                 : /* Control never reaches here */
    1009                 : }
    1010                 : 
    1011                 : 
    1012                 : 
    1013                 : 
    1014                 : /*************************************************
    1015                 : *        Find the fixed length of a pattern      *
    1016                 : *************************************************/
    1017                 : 
    1018                 : /* Scan a pattern and compute the fixed length of subject that will match it,
    1019                 : if the length is fixed. This is needed for dealing with backward assertions.
    1020                 : In UTF8 mode, the result is in characters rather than bytes.
    1021                 : 
    1022                 : Arguments:
    1023                 :   code     points to the start of the pattern (the bracket)
    1024                 :   options  the compiling options
    1025                 : 
    1026                 : Returns:   the fixed length, or -1 if there is no fixed length,
    1027                 :              or -2 if \C was encountered
    1028                 : */
    1029                 : 
    1030                 : static int
    1031                 : find_fixedlength(uschar *code, int options)  /* Walks one bracketed group, recursing into nested groups; yields the length common to all alternatives, -1 if not fixed, -2 on OP_ANYBYTE. */
    1032               0 : {
    1033               0 : int length = -1;  /* -1 until the first branch completes, then the length every later branch must equal */
    1034                 : 
    1035               0 : register int branchlength = 0;  /* running total for the branch currently being scanned */
    1036               0 : register uschar *cc = code + 1 + LINK_SIZE;  /* step over the opening opcode byte and its LINK_SIZE-byte link */
    1037                 : 
    1038                 : /* Scan along the opcodes for this branch. If we get to the end of the
    1039                 : branch, check the length against that of the other branches. */
    1040                 : 
    1041                 : for (;;)
    1042                 :   {
    1043                 :   int d;
    1044               0 :   register int op = *cc;
    1045                 : 
    1046               0 :   switch (op)
    1047                 :     {
    1048                 :     case OP_CBRA:
    1049                 :     case OP_BRA:
    1050                 :     case OP_ONCE:
    1051                 :     case OP_COND:
    1052               0 :     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);  /* for OP_CBRA start 2 bytes later so the callee's "+ 1 + LINK_SIZE" also clears the 2-byte group number that find_bracket reads at 1+LINK_SIZE */
    1053               0 :     if (d < 0) return d;  /* pass -1 / -2 from the nested group straight up */
    1054               0 :     branchlength += d;
    1055               0 :     do cc += GET(cc, 1); while (*cc == OP_ALT);  /* follow the link at offset 1 from alternative to alternative until the opcode is no longer OP_ALT */
    1056               0 :     cc += 1 + LINK_SIZE;  /* step over that closing opcode and its link */
    1057               0 :     break;
    1058                 : 
    1059                 :     /* Reached end of a branch; if it's a ket it is the end of a nested
    1060                 :     call. If it's ALT it is an alternation in a nested call. If it is
    1061                 :     END it's the end of the outer call. All can be handled by the same code. */
    1062                 : 
    1063                 :     case OP_ALT:
    1064                 :     case OP_KET:
    1065                 :     case OP_KETRMAX:
    1066                 :     case OP_KETRMIN:
    1067                 :     case OP_END:
    1068               0 :     if (length < 0) length = branchlength;  /* first finished branch fixes the reference length */
    1069               0 :       else if (length != branchlength) return -1;  /* alternatives of different length: not fixed */
    1070               0 :     if (*cc != OP_ALT) return length;  /* a KET variant or END closes the group: every branch agreed */
    1071               0 :     cc += 1 + LINK_SIZE;  /* OP_ALT: move into the next alternative ... */
    1072               0 :     branchlength = 0;  /* ... and measure it from zero */
    1073               0 :     break;
    1074                 : 
    1075                 :     /* Skip over assertive subpatterns */
    1076                 : 
    1077                 :     case OP_ASSERT:
    1078                 :     case OP_ASSERT_NOT:
    1079                 :     case OP_ASSERTBACK:
    1080                 :     case OP_ASSERTBACK_NOT:
    1081               0 :     do cc += GET(cc, 1); while (*cc == OP_ALT);  /* lands on the assertion's closing opcode; the fall-through below steps over it by table length */
    1082                 :     /* Fall through */
    1083                 : 
    1084                 :     /* Skip over things that don't match chars */
    1085                 : 
    1086                 :     case OP_REVERSE:
    1087                 :     case OP_CREF:
    1088                 :     case OP_RREF:
    1089                 :     case OP_DEF:
    1090                 :     case OP_OPT:
    1091                 :     case OP_CALLOUT:
    1092                 :     case OP_SOD:
    1093                 :     case OP_SOM:
    1094                 :     case OP_EOD:
    1095                 :     case OP_EODN:
    1096                 :     case OP_CIRC:
    1097                 :     case OP_DOLL:
    1098                 :     case OP_NOT_WORD_BOUNDARY:
    1099                 :     case OP_WORD_BOUNDARY:
    1100               0 :     cc += _pcre_OP_lengths[*cc];  /* indexed by *cc, not op: after the assertion fall-through cc is no longer on the opcode held in op */
    1101               0 :     break;
    1102                 : 
    1103                 :     /* Handle literal characters */
    1104                 : 
    1105                 :     case OP_CHAR:
    1106                 :     case OP_CHARNC:
    1107                 :     case OP_NOT:
    1108               0 :     branchlength++;
    1109               0 :     cc += 2;  /* opcode byte plus the first byte of the character */
    1110                 : #ifdef SUPPORT_UTF8
    1111               0 :     if ((options & PCRE_UTF8) != 0)
    1112                 :       {
    1113               0 :       while ((*cc & 0xc0) == 0x80) cc++;  /* skip bytes of the form 10xxxxxx, i.e. the rest of a multibyte character */
    1114                 :       }
    1115                 : #endif
    1116               0 :     break;
    1117                 : 
    1118                 :     /* Handle exact repetitions. The count is already in characters, but we
    1119                 :     need to skip over a multibyte character in UTF8 mode.  */
    1120                 : 
    1121                 :     case OP_EXACT:
    1122               0 :     branchlength += GET2(cc,1);
    1123               0 :     cc += 4;  /* opcode, the 2-byte count just read, and the first byte of the character */
    1124                 : #ifdef SUPPORT_UTF8
    1125               0 :     if ((options & PCRE_UTF8) != 0)
    1126                 :       {
    1127               0 :       while((*cc & 0x80) == 0x80) cc++;  /* NOTE(review): mask is 0x80 here but 0xc0 in the OP_CHAR case above, so this also steps over bytes >= 0xc0 - confirm that is intended */
    1128                 :       }
    1129                 : #endif
    1130               0 :     break;
    1131                 : 
    1132                 :     case OP_TYPEEXACT:
    1133               0 :     branchlength += GET2(cc,1);
    1134               0 :     cc += 4;  /* opcode, 2-byte count and one further byte (presumably the character-type code - confirm) */
    1135               0 :     break;
    1136                 : 
    1137                 :     /* Handle single-char matchers */
    1138                 : 
    1139                 :     case OP_PROP:
    1140                 :     case OP_NOTPROP:
    1141               0 :     cc += 2;  /* two extra bytes for these items; the shared cc++ below supplies the third */
    1142                 :     /* Fall through */
    1143                 : 
    1144                 :     case OP_NOT_DIGIT:
    1145                 :     case OP_DIGIT:
    1146                 :     case OP_NOT_WHITESPACE:
    1147                 :     case OP_WHITESPACE:
    1148                 :     case OP_NOT_WORDCHAR:
    1149                 :     case OP_WORDCHAR:
    1150                 :     case OP_ANY:
    1151               0 :     branchlength++;
    1152               0 :     cc++;
    1153               0 :     break;
    1154                 : 
    1155                 :     /* The single-byte matcher isn't allowed */
    1156                 : 
    1157                 :     case OP_ANYBYTE:
    1158               0 :     return -2;
    1159                 : 
    1160                 :     /* Check a class for variable quantification */
    1161                 : 
    1162                 : #ifdef SUPPORT_UTF8
    1163                 :     case OP_XCLASS:
    1164               0 :     cc += GET(cc, 1) - 33;  /* pre-subtract 33 so the shared "cc += 33" below nets an advance of GET(cc, 1), the length stored in the item */
    1165                 :     /* Fall through */
    1166                 : #endif
    1167                 : 
    1168                 :     case OP_CLASS:
    1169                 :     case OP_NCLASS:
    1170               0 :     cc += 33;  /* step over the 33-byte class item; cc now sits on whatever follows the class */
    1171                 : 
    1172               0 :     switch (*cc)
    1173                 :       {
    1174                 :       case OP_CRSTAR:
    1175                 :       case OP_CRMINSTAR:
    1176                 :       case OP_CRQUERY:
    1177                 :       case OP_CRMINQUERY:
    1178               0 :       return -1;  /* star / query repeat on the class: variable length */
    1179                 : 
    1180                 :       case OP_CRRANGE:
    1181                 :       case OP_CRMINRANGE:
    1182               0 :       if (GET2(cc,1) != GET2(cc,3)) return -1;  /* the two 2-byte counts must be equal for a fixed length */
    1183               0 :       branchlength += GET2(cc,1);
    1184               0 :       cc += 5;  /* repeat opcode plus the two 2-byte counts */
    1185               0 :       break;
    1186                 : 
    1187                 :       default:
    1188               0 :       branchlength++;  /* counted as one character; cc is not advanced, so a following OP_CRPLUS/OP_CRMINPLUS reaches the outer default and returns -1 */
    1189                 :       }
    1190               0 :     break;
    1191                 : 
    1192                 :     /* Anything else is variable length */
    1193                 : 
    1194                 :     default:
    1195               0 :     return -1;
    1196                 :     }
    1197               0 :   }
    1198                 : /* Control never gets here */
    1199                 : }
    1200                 : 
    1201                 : 
    1202                 : 
    1203                 : 
    1204                 : /*************************************************
    1205                 : *    Scan compiled regex for numbered bracket    *
    1206                 : *************************************************/
    1207                 : 
    1208                 : /* This little function scans through a compiled pattern until it finds a
    1209                 : capturing bracket with the given number.
    1210                 : 
    1211                 : Arguments:
    1212                 :   code        points to start of expression
    1213                 :   utf8        TRUE in UTF-8 mode
    1214                 :   number      the required bracket number
    1215                 : 
    1216                 : Returns:      pointer to the opcode for the bracket, or NULL if not found
    1217                 : */
    1218                 : 
    1219                 : static const uschar *
    1220                 : find_bracket(const uschar *code, BOOL utf8, int number)  /* Steps item by item from code until OP_END, looking for an OP_CBRA whose stored number equals `number`. */
    1221               0 : {
    1222                 : for (;;)
    1223                 :   {
    1224               0 :   register int c = *code;
    1225               0 :   if (c == OP_END) return NULL;  /* reached the terminator without a match */
    1226                 : 
    1227                 :   /* XCLASS is used for classes that cannot be represented just by a bit
    1228                 :   map. This includes negated single high-valued characters. The length in
    1229                 :   the table is zero; the actual length is stored in the compiled code. */
    1230                 : 
    1231               0 :   if (c == OP_XCLASS) code += GET(code, 1);
    1232                 : 
    1233                 :   /* Handle capturing bracket */
    1234                 : 
    1235               0 :   else if (c == OP_CBRA)
    1236                 :     {
    1237               0 :     int n = GET2(code, 1+LINK_SIZE);  /* 2-byte group number stored right after the link field */
    1238               0 :     if (n == number) return (uschar *)code;  /* the cast drops const, but the declared return type is still const uschar * */
    1239               0 :     code += _pcre_OP_lengths[c];  /* different group: step over the bracket header and keep scanning inside it */
    1240                 :     }
    1241                 : 
    1242                 :   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
    1243                 :   a multi-byte character. The length in the table is a minimum, so we have to
    1244                 :   arrange to skip the extra bytes. */
    1245                 : 
    1246                 :   else
    1247                 :     {
    1248               0 :     code += _pcre_OP_lengths[c];
    1249               0 :     if (utf8) switch(c)
    1250                 :       {
    1251                 :       case OP_CHAR:
    1252                 :       case OP_CHARNC:
    1253                 :       case OP_EXACT:
    1254                 :       case OP_UPTO:
    1255                 :       case OP_MINUPTO:
    1256                 :       case OP_POSUPTO:
    1257                 :       case OP_STAR:
    1258                 :       case OP_MINSTAR:
    1259                 :       case OP_POSSTAR:
    1260                 :       case OP_PLUS:
    1261                 :       case OP_MINPLUS:
    1262                 :       case OP_POSPLUS:
    1263                 :       case OP_QUERY:
    1264                 :       case OP_MINQUERY:
    1265                 :       case OP_POSQUERY:
    1266               0 :       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];  /* code[-1] is the last byte covered by the table length; a value >= 0xc0 selects an extra skip from _pcre_utf8_table4 (presumably the count of trailing bytes - confirm) */
    1267                 :       break;
    1268                 :       }
    1269                 :     }
    1270               0 :   }
    1271                 : }
    1272                 : 
    1273                 : 
    1274                 : 
    1275                 : /*************************************************
    1276                 : *   Scan compiled regex for recursion reference  *
    1277                 : *************************************************/
    1278                 : 
    1279                 : /* This little function scans through a compiled pattern until it finds an
    1280                 : instance of OP_RECURSE.
    1281                 : 
    1282                 : Arguments:
    1283                 :   code        points to start of expression
    1284                 :   utf8        TRUE in UTF-8 mode
    1285                 : 
    1286                 : Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
    1287                 : */
    1288                 : 
    1289                 : static const uschar *
    1290                 : find_recurse(const uschar *code, BOOL utf8)  /* Steps item by item from code and returns the first OP_RECURSE, or NULL once OP_END is reached. */
    1291             170 : {
    1292                 : for (;;)
    1293                 :   {
    1294             170 :   register int c = *code;
    1295             170 :   if (c == OP_END) return NULL;  /* terminator reached: no OP_RECURSE from the starting point onwards */
    1296             136 :   if (c == OP_RECURSE) return code;  /* pointer to the opcode byte itself, not to its operand */
    1297                 : 
    1298                 :   /* XCLASS is used for classes that cannot be represented just by a bit
    1299                 :   map. This includes negated single high-valued characters. The length in
    1300                 :   the table is zero; the actual length is stored in the compiled code. */
    1301                 : 
    1302             136 :   if (c == OP_XCLASS) code += GET(code, 1);
    1303                 : 
    1304                 :   /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
    1305                 :   that are followed by a character may be followed by a multi-byte character.
    1306                 :   The length in the table is a minimum, so we have to arrange to skip the extra
    1307                 :   bytes. */
    1308                 : 
    1309                 :   else
    1310                 :     {
    1311             136 :     code += _pcre_OP_lengths[c];
    1312             136 :     if (utf8) switch(c)
    1313                 :       {
    1314                 :       case OP_CHAR:
    1315                 :       case OP_CHARNC:
    1316                 :       case OP_EXACT:
    1317                 :       case OP_UPTO:
    1318                 :       case OP_MINUPTO:
    1319                 :       case OP_POSUPTO:
    1320                 :       case OP_STAR:
    1321                 :       case OP_MINSTAR:
    1322                 :       case OP_POSSTAR:
    1323                 :       case OP_PLUS:
    1324                 :       case OP_MINPLUS:
    1325                 :       case OP_POSPLUS:
    1326                 :       case OP_QUERY:
    1327                 :       case OP_MINQUERY:
    1328                 :       case OP_POSQUERY:
    1329               0 :       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];  /* code[-1] is the last byte covered by the table length; a value >= 0xc0 selects an extra skip from _pcre_utf8_table4 (presumably the count of trailing bytes - confirm) */
    1330                 :       break;
    1331                 :       }
    1332                 :     }
    1333             136 :   }
    1334                 : }
    1335                 : 
    1336                 : 
    1337                 : 
    1338                 : /*************************************************
    1339                 : *    Scan compiled branch for non-emptiness      *
    1340                 : *************************************************/
    1341                 : 
    1342                 : /* This function scans through a branch of a compiled pattern to see whether it
    1343                 : can match the empty string or not. It is called from could_be_empty()
    1344                 : below and from compile_branch() when checking for an unlimited repeat of a
    1345                 : group that can match nothing. Note that first_significant_code() skips over
    1346                 : assertions. If we hit an unclosed bracket, we return "empty" - this means we've
    1347                 : struck an inner bracket whose current branch will already have been scanned.
    1348                 : 
    1349                 : Arguments:
    1350                 :   code        points to start of search
    1351                 :   endcode     points to where to stop
    1352                 :   utf8        TRUE if in UTF8 mode
    1353                 : 
    1354                 : Returns:      TRUE if what is matched could be empty
    1355                 : */
    1356                 : 
    1357                 : static BOOL
    1358                 : could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  /* Scans the items of one branch up to endcode; FALSE as soon as an item must consume a character, TRUE if the branch end or endcode is reached first. */
    1359               0 : {
    1360                 : register int c;  /* opcode at code; the for-increment reads it, so every "break"/"continue" must leave c and code consistent */
    1361               0 : for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
    1362               0 :      code < endcode;
    1363               0 :      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
    1364                 :   {
    1365                 :   const uschar *ccode;  /* points just past a class item, at a possible repeat opcode */
    1366                 : 
    1367               0 :   c = *code;
    1368                 : 
    1369               0 :   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
    1370                 :     {
    1371                 :     BOOL empty_branch;
    1372               0 :     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
    1373                 : 
    1374                 :     /* Scan a closed bracket */
    1375                 : 
    1376               0 :     empty_branch = FALSE;
    1377                 :     do
    1378                 :       {
    1379               0 :       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  /* one emptiable alternative is enough, so stop recursing after the first hit ... */
    1380               0 :         empty_branch = TRUE;
    1381               0 :       code += GET(code, 1);  /* ... but still follow the links to the end of the group */
    1382                 :       }
    1383               0 :     while (*code == OP_ALT);
    1384               0 :     if (!empty_branch) return FALSE;   /* All branches are non-empty */
    1385                 : 
    1386                 :     /* Move past the KET and fudge things so that the increment in the "for"
    1387                 :     above has no effect. */
    1388                 : 
    1389               0 :     c = OP_END;
    1390               0 :     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  /* the for-increment adds _pcre_OP_lengths[OP_END] back, leaving a net advance of 1 + LINK_SIZE */
    1391               0 :     continue;
    1392                 :     }
    1393                 : 
    1394                 :   /* Handle the other opcodes */
    1395                 : 
    1396               0 :   switch (c)
    1397                 :     {
    1398                 :     /* Check for quantifiers after a class */
    1399                 : 
    1400                 : #ifdef SUPPORT_UTF8
    1401                 :     case OP_XCLASS:
    1402               0 :     ccode = code + GET(code, 1);  /* item length is stored in the item itself */
    1403               0 :     goto CHECK_CLASS_REPEAT;
    1404                 : #endif
    1405                 : 
    1406                 :     case OP_CLASS:
    1407                 :     case OP_NCLASS:
    1408               0 :     ccode = code + 33;  /* fixed 33-byte class item */
    1409                 : 
    1410                 : #ifdef SUPPORT_UTF8
    1411               0 :     CHECK_CLASS_REPEAT:
    1412                 : #endif
    1413                 : 
    1414               0 :     switch (*ccode)
    1415                 :       {
    1416                 :       case OP_CRSTAR:            /* These could be empty; continue */
    1417                 :       case OP_CRMINSTAR:
    1418                 :       case OP_CRQUERY:
    1419                 :       case OP_CRMINQUERY:
    1420               0 :       break;
    1421                 : 
    1422                 :       default:                   /* Non-repeat => class must match */
    1423                 :       case OP_CRPLUS:            /* These repeats aren't empty */
    1424                 :       case OP_CRMINPLUS:
    1425               0 :       return FALSE;
    1426                 : 
    1427                 :       case OP_CRRANGE:
    1428                 :       case OP_CRMINRANGE:
    1429               0 :       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
    1430                 :       break;
    1431                 :       }
    1432               0 :     break;
    1433                 : 
    1434                 :     /* Opcodes that must match a character */
    1435                 : 
    1436                 :     case OP_PROP:
    1437                 :     case OP_NOTPROP:
    1438                 :     case OP_EXTUNI:
    1439                 :     case OP_NOT_DIGIT:
    1440                 :     case OP_DIGIT:
    1441                 :     case OP_NOT_WHITESPACE:
    1442                 :     case OP_WHITESPACE:
    1443                 :     case OP_NOT_WORDCHAR:
    1444                 :     case OP_WORDCHAR:
    1445                 :     case OP_ANY:
    1446                 :     case OP_ANYBYTE:
    1447                 :     case OP_CHAR:
    1448                 :     case OP_CHARNC:
    1449                 :     case OP_NOT:
    1450                 :     case OP_PLUS:
    1451                 :     case OP_MINPLUS:
    1452                 :     case OP_POSPLUS:
    1453                 :     case OP_EXACT:
    1454                 :     case OP_NOTPLUS:
    1455                 :     case OP_NOTMINPLUS:
    1456                 :     case OP_NOTPOSPLUS:
    1457                 :     case OP_NOTEXACT:
    1458                 :     case OP_TYPEPLUS:
    1459                 :     case OP_TYPEMINPLUS:
    1460                 :     case OP_TYPEPOSPLUS:
    1461                 :     case OP_TYPEEXACT:
    1462               0 :     return FALSE;
    1463                 : 
    1464                 :     /* End of branch */
    1465                 : 
    1466                 :     case OP_KET:
    1467                 :     case OP_KETRMAX:
    1468                 :     case OP_KETRMIN:
    1469                 :     case OP_ALT:
    1470               0 :     return TRUE;
    1471                 : 
    1472                 :     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
    1473                 :     MINUPTO, and POSUPTO may be followed by a multibyte character */
    1474                 : 
    1475                 : #ifdef SUPPORT_UTF8
    1476                 :     case OP_STAR:  /* STAR/QUERY family: the character's first byte is code[1], so continuation bytes start at code[2] */
    1477                 :     case OP_MINSTAR:
    1478                 :     case OP_POSSTAR:
    1479                 :     case OP_QUERY:
    1480                 :     case OP_MINQUERY:
    1481                 :     case OP_POSQUERY:
    1482                 :     case OP_UPTO:  /* UPTO family: a 2-byte repeat count sits at code[1..2], the character's first byte is code[3], continuation bytes start at code[4] */
    1483                 :     case OP_MINUPTO:
    1484                 :     case OP_POSUPTO:
    1485               0 :     if (utf8) while ((code[(c == OP_UPTO || c == OP_MINUPTO || c == OP_POSUPTO)? 4 : 2] & 0xc0) == 0x80) code++;  /* fix: the UPTO family used to test code[2], the low byte of the count, so a count like 128 was taken for a continuation byte */
    1486                 :     break;  /* c is unchanged, so the for-increment still adds the table (minimum) length on top of the bytes skipped here */
    1487                 : #endif
    1488                 :     }
    1489                 :   }
    1490                 : 
    1491               0 : return TRUE;
    1492                 : }
    1493                 : 
    1494                 : 
    1495                 : 
    1496                 : /*************************************************
    1497                 : *    Scan compiled regex for non-emptiness       *
    1498                 : *************************************************/
    1499                 : 
    1500                 : /* This function is called to check for left recursive calls. We want to check
    1501                 : the current branch of the current pattern to see if it could match the empty
    1502                 : string. If it could, we must look outwards for branches at other levels,
    1503                 : stopping when we pass beyond the bracket which is the subject of the recursion.
    1504                 : 
    1505                 : Arguments:
    1506                 :   code        points to start of the recursion
    1507                 :   endcode     points to where to stop (current RECURSE item)
    1508                 :   bcptr       points to the chain of current (unclosed) branch starts
    1509                 :   utf8        TRUE if in UTF-8 mode
    1510                 : 
    1511                 : Returns:      TRUE if what is matched could be empty
    1512                 : */
    1513                 : 
    1514                 : static BOOL
    1515                 : could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
    1516                 :   BOOL utf8)  /* Applies could_be_empty_branch to each link of the bcptr chain, innermost first, while the link's branch starts at or after code. */
    1517               0 : {
    1518               0 : while (bcptr != NULL && bcptr->current >= code)  /* stop at the end of the chain or at the first branch that starts before code */
    1519                 :   {
    1520               0 :   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;  /* a single branch that cannot be empty decides the answer */
    1521               0 :   bcptr = bcptr->outer;  /* move out one nesting level */
    1522                 :   }
    1523               0 : return TRUE;  /* also the result when no chain entry qualified at all */
    1524                 : }
    1525                 : 
    1526                 : 
    1527                 : 
    1528                 : /*************************************************
    1529                 : *           Check for POSIX class syntax         *
    1530                 : *************************************************/
    1531                 : 
    1532                 : /* This function is called when the sequence "[:" or "[." or "[=" is
    1533                 : encountered in a character class. It checks whether this is followed by an
    1534                 : optional ^ and then a sequence of letters, terminated by a matching ":]" or
    1535                 : ".]" or "=]".
    1536                 : 
    1537                 : Argument:
    1538                 :   ptr      pointer to the initial [
    1539                 :   endptr   where to return the end pointer
    1540                 :   cd       pointer to compile data
    1541                 : 
    1542                 : Returns:   TRUE or FALSE
    1543                 : */
    1544                 : 
    1545                 : static BOOL
    1546                 : check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  /* Tests for: one terminator char, optional '^', a run of letters, the same terminator, then ']'. */
    1547               0 : {
    1548                 : int terminator;          /* Don't combine these lines; the Solaris cc */
    1549               0 : terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
    1550               0 : if (*(++ptr) == '^') ptr++;  /* step over an optional '^' that follows the opening pair */
    1551               0 : while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  /* advance over every byte the ctypes table flags as ctype_letter */
    1552               0 : if (*ptr == terminator && ptr[1] == ']')  /* closing pair must repeat the character that followed '[' */
    1553                 :   {
    1554               0 :   *endptr = ptr;  /* points at the closing terminator character, not at the ']' */
    1555               0 :   return TRUE;
    1556                 :   }
    1557               0 : return FALSE;  /* *endptr is left untouched on failure */
    1558                 : }
    1559                 : 
    1560                 : 
    1561                 : 
    1562                 : 
    1563                 : /*************************************************
    1564                 : *          Check POSIX class name                *
    1565                 : *************************************************/
    1566                 : 
    1567                 : /* This function is called to check the name given in a POSIX-style class entry
    1568                 : such as [:alnum:].
    1569                 : 
    1570                 : Arguments:
    1571                 :   ptr        points to the first letter
    1572                 :   len        the length of the name
    1573                 : 
    1574                 : Returns:     a value representing the name, or -1 if unknown
    1575                 : */
    1576                 : 
    1577                 : static int
    1578                 : check_posix_name(const uschar *ptr, int len)  /* Linear search of posix_names for an entry whose length and first len bytes match; returns its index or -1. */
    1579               0 : {
    1580               0 : register int yield = 0;  /* index into the parallel posix_names / posix_name_lengths tables */
    1581               0 : while (posix_name_lengths[yield] != 0)  /* a zero length marks the end of the table */
    1582                 :   {
    1583               0 :   if (len == posix_name_lengths[yield] &&  /* lengths must be equal first, so a name that is only a prefix of an entry is not accepted */
    1584               0 :     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
    1585               0 :   yield++;
    1586                 :   }
    1587               0 : return -1;
    1588                 : }
    1589                 : 
    1590                 : 
    1591                 : /*************************************************
    1592                 : *    Adjust OP_RECURSE items in repeated group   *
    1593                 : *************************************************/
    1594                 : 
    1595                 : /* OP_RECURSE items contain an offset from the start of the regex to the group
    1596                 : that is referenced. This means that groups can be replicated for fixed
    1597                 : repetition simply by copying (because the recursion is allowed to refer to
    1598                 : earlier groups that are outside the current group). However, when a group is
    1599                 : optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
    1600                 : it, after it has been compiled. This means that any OP_RECURSE items within it
    1601                 : that refer to the group itself or any contained groups have to have their
    1602                 : offsets adjusted. That is one of the jobs of this function. Before it is called,
    1603                 : the partially compiled regex must be temporarily terminated with OP_END.
    1604                 : 
    1605                 : This function has been extended with the possibility of forward references for
    1606                 : recursions and subroutine calls. It must also check the list of such references
    1607                 : for the group we are dealing with. If it finds that one of the recursions in
    1608                 : the current group is on this list, it adjusts the offset in the list, not the
    1609                 : value in the reference (which is a group number).
    1610                 : 
    1611                 : Arguments:
    1612                 :   group      points to the start of the group
    1613                 :   adjust     the amount by which the group is to be moved
    1614                 :   utf8       TRUE in UTF-8 mode
    1615                 :   cd         contains pointers to tables etc.
    1616                 :   save_hwm   the hwm forward reference pointer at the start of the group
    1617                 : 
    1618                 : Returns:     nothing
    1619                 : */
    1620                 : 
    1621                 : static void
    1622                 : adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
    1623                 :   uschar *save_hwm)  /* For every OP_RECURSE found from group onwards, adds `adjust` either to its matching entry in the save_hwm..cd->hwm list or to the offset stored in the item. */
    1624              34 : {
    1625              34 : uschar *ptr = group;
    1626              68 : while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  /* find_recurse returns a const pointer; it is cast back because the item may be rewritten in place below */
    1627                 :   {
    1628                 :   int offset;
    1629                 :   uschar *hc;
    1630                 : 
    1631                 :   /* See if this recursion is on the forward reference list. If so, adjust the
    1632                 :   reference. */
    1633                 : 
    1634               0 :   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)  /* list entries are LINK_SIZE bytes each */
    1635                 :     {
    1636               0 :     offset = GET(hc, 0);
    1637               0 :     if (cd->start_code + offset == ptr + 1)  /* entry addresses the byte right after this OP_RECURSE opcode */
    1638                 :       {
    1639               0 :       PUT(hc, 0, offset + adjust);  /* move the list entry; the item's own operand is left alone */
    1640               0 :       break;
    1641                 :       }
    1642                 :     }
    1643                 : 
    1644                 :   /* Otherwise, adjust the recursion offset if it's after the start of this
    1645                 :   group. */
    1646                 : 
    1647               0 :   if (hc >= cd->hwm)  /* true only when the loop above ended without a match */
    1648                 :     {
    1649               0 :     offset = GET(ptr, 1);
    1650               0 :     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);  /* ">=" includes a reference to the group itself; targets before the group are unchanged */
    1651                 :     }
    1652                 : 
    1653               0 :   ptr += 1 + LINK_SIZE;  /* resume the search after this OP_RECURSE item */
    1654                 :   }
    1655              34 : }
    1656                 : 
    1657                 : 
    1658                 : 
    1659                 : /*************************************************
    1660                 : *        Insert an automatic callout point       *
    1661                 : *************************************************/
    1662                 : 
    1663                 : /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
    1664                 : callout points before each pattern item.
    1665                 : 
    1666                 : Arguments:
    1667                 :   code           current code pointer
    1668                 :   ptr            current pattern pointer
    1669                 :   cd             pointers to tables etc
    1670                 : 
    1671                 : Returns:         new code pointer
    1672                 : */
    1673                 : 
    1674                 : static uschar *
    1675                 : auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
    1676               0 : {
    1677               0 : *code++ = OP_CALLOUT;
    1678               0 : *code++ = 255;
    1679               0 : PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
    1680               0 : PUT(code, LINK_SIZE, 0);                /* Default length */
    1681               0 : return code + 2*LINK_SIZE;
    1682                 : }
    1683                 : 
    1684                 : 
    1685                 : 
    1686                 : /*************************************************
    1687                 : *         Complete a callout item                *
    1688                 : *************************************************/
    1689                 : 
    1690                 : /* A callout item contains the length of the next item in the pattern, which
    1691                 : we can't fill in till after we have reached the relevant point. This is used
    1692                 : for both automatic and manual callouts.
    1693                 : 
    1694                 : Arguments:
    1695                 :   previous_callout   points to previous callout item
    1696                 :   ptr                current pattern pointer
    1697                 :   cd                 pointers to tables etc
    1698                 : 
    1699                 : Returns:             nothing
    1700                 : */
    1701                 : 
    1702                 : static void
    1703                 : complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
    1704               0 : {
    1705               0 : int length = ptr - cd->start_pattern - GET(previous_callout, 2);
    1706               0 : PUT(previous_callout, 2 + LINK_SIZE, length);
    1707               0 : }
    1708                 : 
    1709                 : 
    1710                 : 
    1711                 : #ifdef SUPPORT_UCP
    1712                 : /*************************************************
    1713                 : *           Get othercase range                  *
    1714                 : *************************************************/
    1715                 : 
    1716                 : /* This function is passed the start and end of a class range, in UTF-8 mode
    1717                 : with UCP support. It searches up the characters, looking for internal ranges of
    1718                 : characters in the "other" case. Each call returns the next one, updating the
    1719                 : start address.
    1720                 : 
    1721                 : Arguments:
    1722                 :   cptr        points to starting character value; updated
    1723                 :   d           end value
    1724                 :   ocptr       where to put start of othercase range
    1725                 :   odptr       where to put end of othercase range
    1726                 : 
    1727                 : Yield:        TRUE when range returned; FALSE when no more
    1728                 : */
    1729                 : 
    1730                 : static BOOL
    1731                 : get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
    1732                 :   unsigned int *odptr)
    1733               0 : {
    1734                 : unsigned int c, othercase, next;
    1735                 : 
    1736               0 : for (c = *cptr; c <= d; c++)
    1737               0 :   { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
    1738                 : 
    1739               0 : if (c > d) return FALSE;
    1740                 : 
    1741               0 : *ocptr = othercase;
    1742               0 : next = othercase + 1;
    1743                 : 
    1744               0 : for (++c; c <= d; c++)
    1745                 :   {
    1746               0 :   if (_pcre_ucp_othercase(c) != next) break;
    1747               0 :   next++;
    1748                 :   }
    1749                 : 
    1750               0 : *odptr = next - 1;
    1751               0 : *cptr = c;
    1752                 : 
    1753               0 : return TRUE;
    1754                 : }
    1755                 : #endif  /* SUPPORT_UCP */
    1756                 : 
    1757                 : 
    1758                 : 
    1759                 : /*************************************************
    1760                 : *     Check if auto-possessifying is possible    *
    1761                 : *************************************************/
    1762                 : 
    1763                 : /* This function is called for unlimited repeats of certain items, to see
    1764                 : whether the next thing could possibly match the repeated item. If not, it makes
    1765                 : sense to automatically possessify the repeated item.
    1766                 : 
    1767                 : Arguments:
    1768                 :   op_code       the repeated op code
    1769                 :   item          data for this item, depends on the opcode
    1770                 :   utf8          TRUE in UTF-8 mode
    1771                 :   utf8_char     used for utf8 character bytes, NULL if not relevant
    1772                 :   ptr           next character in pattern
    1773                 :   options       options bits
    1774                 :   cd            contains pointers to tables etc.
    1775                 : 
    1776                 : Returns:        TRUE if possessifying is wanted
    1777                 : */
    1778                 : 
    1779                 : static BOOL
    1780                 : check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
    1781                 :   const uschar *ptr, int options, compile_data *cd)
    1782             494 : {
    1783                 : int next;
    1784                 : 
    1785                 : /* Skip whitespace and comments in extended mode */
    1786                 : 
    1787             494 : if ((options & PCRE_EXTENDED) != 0)
    1788                 :   {
    1789                 :   for (;;)
    1790                 :     {
    1791               0 :     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
    1792               0 :     if (*ptr == '#')
    1793                 :       {
    1794               0 :       while (*(++ptr) != 0)
    1795               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
    1796                 :       }
    1797               0 :     else break;
    1798               0 :     }
    1799                 :   }
    1800                 : 
    1801                 : /* If the next item is one that we can handle, get its value. A non-negative
    1802                 : value is a character, a negative value is an escape value. */
    1803                 : 
    1804             494 : if (*ptr == '\\')
    1805                 :   {
    1806              12 :   int temperrorcode = 0;
    1807              12 :   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
    1808              12 :   if (temperrorcode != 0) return FALSE;
    1809              12 :   ptr++;    /* Point after the escape sequence */
    1810                 :   }
    1811                 : 
    1812             482 : else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
    1813                 :   {
    1814                 : #ifdef SUPPORT_UTF8
    1815             448 :   if (utf8) { GETCHARINC(next, ptr); } else
    1816                 : #endif
    1817             448 :   next = *ptr++;
    1818                 :   }
    1819                 : 
    1820              34 : else return FALSE;
    1821                 : 
    1822                 : /* Skip whitespace and comments in extended mode */
    1823                 : 
    1824             460 : if ((options & PCRE_EXTENDED) != 0)
    1825                 :   {
    1826                 :   for (;;)
    1827                 :     {
    1828               0 :     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
    1829               0 :     if (*ptr == '#')
    1830                 :       {
    1831               0 :       while (*(++ptr) != 0)
    1832               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
    1833                 :       }
    1834               0 :     else break;
    1835               0 :     }
    1836                 :   }
    1837                 : 
    1838                 : /* If the next thing is itself optional, we have to give up. */
    1839                 : 
    1840             460 : if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
    1841               0 :   return FALSE;
    1842                 : 
    1843                 : /* Now compare the next item with the previous opcode. If the previous is a
    1844                 : positive single character match, "item" either contains the character or, if
    1845                 : "item" is greater than 127 in utf8 mode, the character's bytes are in
    1846                 : utf8_char. */
    1847                 : 
    1848                 : 
    1849                 : /* Handle cases when the next item is a character. */
    1850                 : 
    1851             460 : if (next >= 0) switch(op_code)
    1852                 :   {
    1853                 :   case OP_CHAR:
    1854                 : #ifdef SUPPORT_UTF8
    1855               6 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    1856                 : #endif
    1857               6 :   return item != next;
    1858                 : 
    1859                 :   /* For CHARNC (caseless character) we must check the other case. If we have
    1860                 :   Unicode property support, we can use it to test the other case of
    1861                 :   high-valued characters. */
    1862                 : 
    1863                 :   case OP_CHARNC:
    1864                 : #ifdef SUPPORT_UTF8
    1865               0 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    1866                 : #endif
    1867               0 :   if (item == next) return FALSE;
    1868                 : #ifdef SUPPORT_UTF8
    1869               0 :   if (utf8)
    1870                 :     {
    1871                 :     unsigned int othercase;
    1872               0 :     if (next < 128) othercase = cd->fcc[next]; else
    1873                 : #ifdef SUPPORT_UCP
    1874               0 :     othercase = _pcre_ucp_othercase((unsigned int)next);
    1875                 : #else
    1876                 :     othercase = NOTACHAR;
    1877                 : #endif
    1878               0 :     return (unsigned int)item != othercase;
    1879                 :     }
    1880                 :   else
    1881                 : #endif  /* SUPPORT_UTF8 */
    1882               0 :   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
    1883                 : 
    1884                 :   /* For OP_NOT, "item" must be a single-byte character. */
    1885                 : 
    1886                 :   case OP_NOT:
    1887               0 :   if (next < 0) return FALSE;  /* Not a character */
    1888               0 :   if (item == next) return TRUE;
    1889               0 :   if ((options & PCRE_CASELESS) == 0) return FALSE;
    1890                 : #ifdef SUPPORT_UTF8
    1891               0 :   if (utf8)
    1892                 :     {
    1893                 :     unsigned int othercase;
    1894               0 :     if (next < 128) othercase = cd->fcc[next]; else
    1895                 : #ifdef SUPPORT_UCP
    1896               0 :     othercase = _pcre_ucp_othercase(next);
    1897                 : #else
    1898                 :     othercase = NOTACHAR;
    1899                 : #endif
    1900               0 :     return (unsigned int)item == othercase;
    1901                 :     }
    1902                 :   else
    1903                 : #endif  /* SUPPORT_UTF8 */
    1904               0 :   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
    1905                 : 
    1906                 :   case OP_DIGIT:
    1907               0 :   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
    1908                 : 
    1909                 :   case OP_NOT_DIGIT:
    1910               0 :   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
    1911                 : 
    1912                 :   case OP_WHITESPACE:
    1913               2 :   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
    1914                 : 
    1915                 :   case OP_NOT_WHITESPACE:
    1916               0 :   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
    1917                 : 
    1918                 :   case OP_WORDCHAR:
    1919               0 :   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
    1920                 : 
    1921                 :   case OP_NOT_WORDCHAR:
    1922               0 :   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
    1923                 : 
    1924                 :   default:
    1925             452 :   return FALSE;
    1926                 :   }
    1927                 : 
    1928                 : 
    1929                 : /* Handle the case when the next item is \d, \s, etc. */
    1930                 : 
    1931               0 : switch(op_code)
    1932                 :   {
    1933                 :   case OP_CHAR:
    1934                 :   case OP_CHARNC:
    1935                 : #ifdef SUPPORT_UTF8
    1936               0 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    1937                 : #endif
    1938               0 :   switch(-next)
    1939                 :     {
    1940                 :     case ESC_d:
    1941               0 :     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
    1942                 : 
    1943                 :     case ESC_D:
    1944               0 :     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
    1945                 : 
    1946                 :     case ESC_s:
    1947               0 :     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
    1948                 : 
    1949                 :     case ESC_S:
    1950               0 :     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
    1951                 : 
    1952                 :     case ESC_w:
    1953               0 :     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
    1954                 : 
    1955                 :     case ESC_W:
    1956               0 :     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
    1957                 : 
    1958                 :     default:
    1959               0 :     return FALSE;
    1960                 :     }
    1961                 : 
    1962                 :   case OP_DIGIT:
    1963               0 :   return next == -ESC_D || next == -ESC_s || next == -ESC_W;
    1964                 : 
    1965                 :   case OP_NOT_DIGIT:
    1966               0 :   return next == -ESC_d;
    1967                 : 
    1968                 :   case OP_WHITESPACE:
    1969               0 :   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
    1970                 : 
    1971                 :   case OP_NOT_WHITESPACE:
    1972               0 :   return next == -ESC_s;
    1973                 : 
    1974                 :   case OP_WORDCHAR:
    1975               0 :   return next == -ESC_W || next == -ESC_s;
    1976                 : 
    1977                 :   case OP_NOT_WORDCHAR:
    1978               0 :   return next == -ESC_w || next == -ESC_d;
    1979                 : 
    1980                 :   default:
    1981               0 :   return FALSE;
    1982                 :   }
    1983                 : 
    1984                 : /* Control does not reach here */
    1985                 : }
    1986                 : 
    1987                 : 
    1988                 : 
    1989                 : /*************************************************
    1990                 : *           Compile one branch                   *
    1991                 : *************************************************/
    1992                 : 
    1993                 : /* Scan the pattern, compiling it into a vector. If the options are
    1994                 : changed during the branch, the pointer is used to change the external options
    1995                 : bits. This function is used during the pre-compile phase when we are trying
    1996                 : to find out the amount of memory needed, as well as during the real compile
    1997                 : phase. The value of lengthptr distinguishes the two phases.
    1998                 : 
    1999                 : Arguments:
    2000                 :   optionsptr     pointer to the option bits
    2001                 :   codeptr        points to the pointer to the current code point
    2002                 :   ptrptr         points to the current pattern pointer
    2003                 :   errorcodeptr   points to error code variable
    2004                 :   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
    2005                 :   reqbyteptr     set to the last literal character required, else < 0
    2006                 :   bcptr          points to current branch chain
    2007                 :   cd             contains pointers to tables etc.
    2008                 :   lengthptr      NULL during the real compile phase
    2009                 :                  points to length accumulator during pre-compile phase
    2010                 : 
    2011                 : Returns:         TRUE on success
    2012                 :                  FALSE, with *errorcodeptr set non-zero on error
    2013                 : */
    2014                 : 
    2015                 : static BOOL
    2016                 : compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
    2017                 :   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
    2018                 :   compile_data *cd, int *lengthptr)
    2019             406 : {
    2020                 : int repeat_type, op_type;
    2021             406 : int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
    2022             406 : int bravalue = 0;
    2023                 : int greedy_default, greedy_non_default;
    2024                 : int firstbyte, reqbyte;
    2025                 : int zeroreqbyte, zerofirstbyte;
    2026                 : int req_caseopt, reqvary, tempreqvary;
    2027             406 : int options = *optionsptr;
    2028             406 : int after_manual_callout = 0;
    2029             406 : int length_prevgroup = 0;
    2030                 : register int c;
    2031             406 : register uschar *code = *codeptr;
    2032             406 : uschar *last_code = code;
    2033             406 : uschar *orig_code = code;
    2034                 : uschar *tempcode;
    2035             406 : BOOL inescq = FALSE;
    2036             406 : BOOL groupsetfirstbyte = FALSE;
    2037             406 : const uschar *ptr = *ptrptr;
    2038                 : const uschar *tempptr;
    2039             406 : uschar *previous = NULL;
    2040             406 : uschar *previous_callout = NULL;
    2041             406 : uschar *save_hwm = NULL;
    2042                 : uschar classbits[32];
    2043                 : 
    2044                 : #ifdef SUPPORT_UTF8
    2045                 : BOOL class_utf8;
    2046             406 : BOOL utf8 = (options & PCRE_UTF8) != 0;
    2047                 : uschar *class_utf8data;
    2048                 : uschar utf8_char[6];
    2049                 : #else
    2050                 : BOOL utf8 = FALSE;
    2051                 : uschar *utf8_char = NULL;
    2052                 : #endif
    2053                 : 
    2054                 : #ifdef DEBUG
    2055                 : if (lengthptr != NULL) DPRINTF((">> start branch\n"));
    2056                 : #endif
    2057                 : 
    2058                 : /* Set up the default and non-default settings for greediness */
    2059                 : 
    2060             406 : greedy_default = ((options & PCRE_UNGREEDY) != 0);
    2061             406 : greedy_non_default = greedy_default ^ 1;
    2062                 : 
    2063                 : /* Initialize no first byte, no required byte. REQ_UNSET means "no char
    2064                 : matching encountered yet". It gets changed to REQ_NONE if we hit something that
    2065                 : matches a non-fixed first char; reqbyte just remains unset if we never
    2066                 : find one.
    2067                 : 
    2068                 : When we hit a repeat whose minimum is zero, we may have to adjust these values
    2069                 : to take the zero repeat into account. This is implemented by setting them to
    2070                 : zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
    2071                 : item types that can be repeated set these backoff variables appropriately. */
    2072                 : 
    2073             406 : firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
    2074                 : 
    2075                 : /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
    2076                 : according to the current setting of the caseless flag. REQ_CASELESS is a bit
    2077                 : value > 255. It is added into the firstbyte or reqbyte variables to record the
    2078                 : case status of the value. This is used only for ASCII characters. */
    2079                 : 
    2080             406 : req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
    2081                 : 
    2082                 : /* Switch on next character until the end of the branch */
    2083                 : 
    2084           73508 : for (;; ptr++)
    2085                 :   {
    2086                 :   BOOL negate_class;
    2087                 :   BOOL possessive_quantifier;
    2088                 :   BOOL is_quantifier;
    2089                 :   BOOL is_recurse;
    2090                 :   int class_charcount;
    2091                 :   int class_lastchar;
    2092                 :   int newoptions;
    2093                 :   int recno;
    2094                 :   int skipbytes;
    2095                 :   int subreqbyte;
    2096                 :   int subfirstbyte;
    2097                 :   int terminator;
    2098                 :   int mclength;
    2099                 :   uschar mcbuffer[8];
    2100                 : 
    2101                 :   /* Get next byte in the pattern */
    2102                 : 
    2103           73914 :   c = *ptr;
    2104                 : 
    2105                 :   /* If we are in the pre-compile phase, accumulate the length used for the
    2106                 :   previous cycle of this loop. */
    2107                 : 
    2108           73914 :   if (lengthptr != NULL)
    2109                 :     {
    2110                 : #ifdef DEBUG
    2111                 :     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
    2112                 : #endif
    2113           36957 :     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
    2114                 :       {
    2115               0 :       *errorcodeptr = ERR52;
    2116               0 :       goto FAILED;
    2117                 :       }
    2118                 : 
    2119                 :     /* There is at least one situation where code goes backwards: this is the
    2120                 :     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
    2121                 :     the class is simply eliminated. However, it is created first, so we have to
    2122                 :     allow memory for it. Therefore, don't ever reduce the length at this point.
    2123                 :     */
    2124                 : 
    2125           36957 :     if (code < last_code) code = last_code;
    2126           36957 :     *lengthptr += code - last_code;
    2127                 :     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
    2128                 : 
    2129                 :     /* If "previous" is set and it is not at the start of the work space, move
    2130                 :     it back to there, in order to avoid filling up the work space. Otherwise,
    2131                 :     if "previous" is NULL, reset the current code pointer to the start. */
    2132                 : 
    2133           36957 :     if (previous != NULL)
    2134                 :       {
    2135           35760 :       if (previous > orig_code)
    2136                 :         {
    2137           34961 :         memmove(orig_code, previous, code - previous);
    2138           34961 :         code -= previous - orig_code;
    2139           34961 :         previous = orig_code;
    2140                 :         }
    2141                 :       }
    2142            1197 :     else code = orig_code;
    2143                 : 
    2144                 :     /* Remember where this code item starts so we can pick up the length
    2145                 :     next time round. */
    2146                 : 
    2147           36957 :     last_code = code;
    2148                 :     }
    2149                 : 
    2150                 :   /* In the real compile phase, just check the workspace used by the forward
    2151                 :   reference list. */
    2152                 : 
    2153           36957 :   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
    2154                 :     {
    2155               0 :     *errorcodeptr = ERR52;
    2156               0 :     goto FAILED;
    2157                 :     }
    2158                 : 
    2159                 :   /* If in \Q...\E, check for the end; if not, we have a literal */
    2160                 : 
    2161           73914 :   if (inescq && c != 0)
    2162                 :     {
    2163               0 :     if (c == '\\' && ptr[1] == 'E')
    2164                 :       {
    2165               0 :       inescq = FALSE;
    2166               0 :       ptr++;
    2167               0 :       continue;
    2168                 :       }
    2169                 :     else
    2170                 :       {
    2171               0 :       if (previous_callout != NULL)
    2172                 :         {
    2173               0 :         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
    2174               0 :           complete_callout(previous_callout, ptr, cd);
    2175               0 :         previous_callout = NULL;
    2176                 :         }
    2177               0 :       if ((options & PCRE_AUTO_CALLOUT) != 0)
    2178                 :         {
    2179               0 :         previous_callout = code;
    2180               0 :         code = auto_callout(code, ptr, cd);
    2181                 :         }
    2182               0 :       goto NORMAL_CHAR;
    2183                 :       }
    2184                 :     }
    2185                 : 
    2186                 :   /* Fill in length of a previous callout, except when the next thing is
    2187                 :   a quantifier. */
    2188                 : 
    2189           73914 :   is_quantifier = c == '*' || c == '+' || c == '?' ||
    2190                 :     (c == '{' && is_counted_repeat(ptr+1));
    2191                 : 
    2192           73914 :   if (!is_quantifier && previous_callout != NULL &&
    2193                 :        after_manual_callout-- <= 0)
    2194                 :     {
    2195               0 :     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
    2196               0 :       complete_callout(previous_callout, ptr, cd);
    2197               0 :     previous_callout = NULL;
    2198                 :     }
    2199                 : 
    2200                 :   /* In extended mode, skip white space and comments */
    2201                 : 
    2202           73914 :   if ((options & PCRE_EXTENDED) != 0)
    2203                 :     {
    2204               0 :     if ((cd->ctypes[c] & ctype_space) != 0) continue;
    2205               0 :     if (c == '#')
    2206                 :       {
    2207               0 :       while (*(++ptr) != 0)
    2208                 :         {
    2209               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
    2210                 :         }
    2211               0 :       if (*ptr != 0) continue;
    2212                 : 
    2213                 :       /* Else fall through to handle end of string */
    2214               0 :       c = 0;
    2215                 :       }
    2216                 :     }
    2217                 : 
    2218                 :   /* No auto callout for quantifiers. */
    2219                 : 
    2220           73914 :   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
    2221                 :     {
    2222               0 :     previous_callout = code;
    2223               0 :     code = auto_callout(code, ptr, cd);
    2224                 :     }
    2225                 : 
    2226           73914 :   switch(c)
    2227                 :     {
    2228                 :     /* ===================================================================*/
    2229                 :     case 0:                        /* The branch terminates at string end */
    2230                 :     case '|':                      /* or | or ) */
    2231                 :     case ')':
    2232             406 :     *firstbyteptr = firstbyte;
    2233             406 :     *reqbyteptr = reqbyte;
    2234             406 :     *codeptr = code;
    2235             406 :     *ptrptr = ptr;
    2236             406 :     if (lengthptr != NULL)
    2237                 :       {
    2238             203 :       *lengthptr += code - last_code;   /* To include callout length */
    2239                 :       DPRINTF((">> end branch\n"));
    2240                 :       }
    2241             406 :     return TRUE;
    2242                 : 
    2243                 : 
    2244                 :     /* ===================================================================*/
    2245                 :     /* Handle single-character metacharacters. In multiline mode, ^ disables
    2246                 :     the setting of any following char as a first character. */
    2247                 : 
    2248                 :     case '^':
    2249             364 :     if ((options & PCRE_MULTILINE) != 0)
    2250                 :       {
    2251               0 :       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2252                 :       }
    2253             364 :     previous = NULL;
    2254             364 :     *code++ = OP_CIRC;
    2255             364 :     break;
    2256                 : 
    2257                 :     case '$':
    2258             360 :     previous = NULL;
    2259             360 :     *code++ = OP_DOLL;
    2260             360 :     break;
    2261                 : 
    2262                 :     /* There can never be a first char if '.' is first, whatever happens about
    2263                 :     repeats. The value of reqbyte doesn't change either. */
    2264                 : 
    2265                 :     case '.':
    2266             482 :     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2267             482 :     zerofirstbyte = firstbyte;
    2268             482 :     zeroreqbyte = reqbyte;
    2269             482 :     previous = code;
    2270             482 :     *code++ = OP_ANY;
    2271             482 :     break;
    2272                 : 
    2273                 : 
    2274                 :     /* ===================================================================*/
    2275                 :     /* Character classes. If the included characters are all < 256, we build a
    2276                 :     32-byte bitmap of the permitted characters, except in the special case
    2277                 :     where there is only one such character. For negated classes, we build the
    2278                 :     map as usual, then invert it at the end. However, we use a different opcode
    2279                 :     so that data characters > 255 can be handled correctly.
    2280                 : 
    2281                 :     If the class contains characters outside the 0-255 range, a different
    2282                 :     opcode is compiled. It may optionally have a bit map for characters < 256,
    2283                 :     but those above are explicitly listed afterwards. A flag byte tells
    2284                 :     whether the bitmap is present, and whether this is a negated class or not.
    2285                 :     */
    2286                 : 
    2287                 :     case '[':
    2288             634 :     previous = code;
    2289                 : 
    2290                 :     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
    2291                 :     they are encountered at the top level, so we'll do that too. */
    2292                 : 
    2293             634 :     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
    2294                 :         check_posix_syntax(ptr, &tempptr, cd))
    2295                 :       {
    2296               0 :       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
    2297               0 :       goto FAILED;
    2298                 :       }
    2299                 : 
    2300                 :     /* If the first character is '^', set the negation flag and skip it. */
    2301                 : 
    2302             634 :     if ((c = *(++ptr)) == '^')
    2303                 :       {
    2304               0 :       negate_class = TRUE;
    2305               0 :       c = *(++ptr);
    2306                 :       }
    2307                 :     else
    2308                 :       {
    2309             634 :       negate_class = FALSE;
    2310                 :       }
    2311                 : 
    2312                 :     /* Keep a count of chars with values < 256 so that we can optimize the case
    2313                 :     of just a single character (as long as it's < 256). However, for higher
    2314                 :     valued UTF-8 characters, we don't yet do any optimization. */
    2315                 : 
    2316             634 :     class_charcount = 0;
    2317             634 :     class_lastchar = -1;
    2318                 : 
    2319                 :     /* Initialize the 32-char bit map to all zeros. We build the map in a
    2320                 :     temporary bit of memory, in case the class contains only 1 character (less
    2321                 :     than 256), because in that case the compiled code doesn't use the bit map.
    2322                 :     */
    2323                 : 
    2324             634 :     memset(classbits, 0, 32 * sizeof(uschar));
    2325                 : 
    2326                 : #ifdef SUPPORT_UTF8
    2327             634 :     class_utf8 = FALSE;                       /* No chars >= 256 */
    2328             634 :     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
    2329                 : #endif
    2330                 : 
    2331                 :     /* Process characters until ] is reached. By writing this as a "do" it
    2332                 :     means that an initial ] is taken as a data character. At the start of the
    2333                 :     loop, c contains the first byte of the character. */
    2334                 : 
    2335             634 :     if (c != 0) do
    2336                 :       {
    2337                 :       const uschar *oldptr;
    2338                 : 
    2339                 : #ifdef SUPPORT_UTF8
    2340             672 :       if (utf8 && c > 127)
    2341                 :         {                           /* Braces are required because the */
    2342               0 :         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
    2343                 :         }
    2344                 : #endif
    2345                 : 
    2346                 :       /* Inside \Q...\E everything is literal except \E */
    2347                 : 
    2348             672 :       if (inescq)
    2349                 :         {
    2350               0 :         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
    2351                 :           {
    2352               0 :           inescq = FALSE;                   /* Reset literal state */
    2353               0 :           ptr++;                            /* Skip the 'E' */
    2354               0 :           continue;                         /* Carry on with next */
    2355                 :           }
    2356               0 :         goto CHECK_RANGE;                   /* Could be range if \E follows */
    2357                 :         }
    2358                 : 
    2359                 :       /* Handle POSIX class names. Perl allows a negation extension of the
    2360                 :       form [:^name:]. A square bracket that doesn't match the syntax is
    2361                 :       treated as a literal. We also recognize the POSIX constructions
    2362                 :       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
    2363                 :       5.6 and 5.8 do. */
    2364                 : 
    2365             672 :       if (c == '[' &&
    2366                 :           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
    2367                 :           check_posix_syntax(ptr, &tempptr, cd))
    2368                 :         {
    2369               0 :         BOOL local_negate = FALSE;
    2370                 :         int posix_class, taboffset, tabopt;
    2371               0 :         register const uschar *cbits = cd->cbits;
    2372                 :         uschar pbits[32];
    2373                 : 
    2374               0 :         if (ptr[1] != ':')
    2375                 :           {
    2376               0 :           *errorcodeptr = ERR31;
    2377               0 :           goto FAILED;
    2378                 :           }
    2379                 : 
    2380               0 :         ptr += 2;
    2381               0 :         if (*ptr == '^')
    2382                 :           {
    2383               0 :           local_negate = TRUE;
    2384               0 :           ptr++;
    2385                 :           }
    2386                 : 
    2387               0 :         posix_class = check_posix_name(ptr, tempptr - ptr);
    2388               0 :         if (posix_class < 0)
    2389                 :           {
    2390               0 :           *errorcodeptr = ERR30;
    2391               0 :           goto FAILED;
    2392                 :           }
    2393                 : 
    2394                 :         /* If matching is caseless, upper and lower are converted to
    2395                 :         alpha. This relies on the fact that the class table starts with
    2396                 :         alpha, lower, upper as the first 3 entries. */
    2397                 : 
    2398               0 :         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
    2399               0 :           posix_class = 0;
    2400                 : 
    2401                 :         /* We build the bit map for the POSIX class in a chunk of local store
    2402                 :         because we may be adding and subtracting from it, and we don't want to
    2403                 :         subtract bits that may be in the main map already. At the end we or the
    2404                 :         result into the bit map that is being built. */
    2405                 : 
    2406               0 :         posix_class *= 3;
    2407                 : 
    2408                 :         /* Copy in the first table (always present) */
    2409                 : 
    2410               0 :         memcpy(pbits, cbits + posix_class_maps[posix_class],
    2411                 :           32 * sizeof(uschar));
    2412                 : 
    2413                 :         /* If there is a second table, add or remove it as required. */
    2414                 : 
    2415               0 :         taboffset = posix_class_maps[posix_class + 1];
    2416               0 :         tabopt = posix_class_maps[posix_class + 2];
    2417                 : 
    2418               0 :         if (taboffset >= 0)
    2419                 :           {
    2420               0 :           if (tabopt >= 0)
    2421               0 :             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
    2422                 :           else
    2423               0 :             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
    2424                 :           }
    2425                 : 
    2426                 :         /* Now see if we need to remove any special characters. An option
    2427                 :         value of 1 removes vertical space and 2 removes underscore. */
    2428                 : 
    2429               0 :         if (tabopt < 0) tabopt = -tabopt;
    2430               0 :         if (tabopt == 1) pbits[1] &= ~0x3c;
    2431               0 :           else if (tabopt == 2) pbits[11] &= 0x7f;
    2432                 : 
    2433                 :         /* Add the POSIX table or its complement into the main table that is
    2434                 :         being built and we are done. */
    2435                 : 
    2436               0 :         if (local_negate)
    2437               0 :           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
    2438                 :         else
    2439               0 :           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
    2440                 : 
    2441               0 :         ptr = tempptr + 1;
    2442               0 :         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
    2443               0 :         continue;    /* End of POSIX syntax handling */
    2444                 :         }
    2445                 : 
    2446                 :       /* Backslash may introduce a single character, or it may introduce one
    2447                 :       of the specials, which just set a flag. The sequence \b is a special
    2448                 :       case. Inside a class (and only there) it is treated as backspace.
    2449                 :       Elsewhere it marks a word boundary. Other escapes have preset maps ready
    2450                 :       to or into the one we are building. We assume they have more than one
    2451                 :       character in them, so set class_charcount bigger than one. */
    2452                 : 
    2453             672 :       if (c == '\\')
    2454                 :         {
    2455              34 :         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
    2456              34 :         if (*errorcodeptr != 0) goto FAILED;
    2457                 : 
    2458              34 :         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
    2459              34 :         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
    2460              34 :         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
    2461              34 :         else if (-c == ESC_Q)            /* Handle start of quoted string */
    2462                 :           {
    2463               0 :           if (ptr[1] == '\\' && ptr[2] == 'E')
    2464                 :             {
    2465               0 :             ptr += 2; /* avoid empty string */
    2466                 :             }
    2467               0 :           else inescq = TRUE;
    2468               0 :           continue;
    2469                 :           }
    2470                 : 
    2471              34 :         if (c < 0)
    2472                 :           {
    2473               0 :           register const uschar *cbits = cd->cbits;
    2474               0 :           class_charcount += 2;     /* Greater than 1 is what matters */
    2475                 : 
    2476                 :           /* Save time by not doing this in the pre-compile phase. */
    2477                 : 
    2478               0 :           if (lengthptr == NULL) switch (-c)
    2479                 :             {
    2480                 :             case ESC_d:
    2481               0 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
    2482               0 :             continue;
    2483                 : 
    2484                 :             case ESC_D:
    2485               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
    2486               0 :             continue;
    2487                 : 
    2488                 :             case ESC_w:
    2489               0 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
    2490               0 :             continue;
    2491                 : 
    2492                 :             case ESC_W:
    2493               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
    2494               0 :             continue;
    2495                 : 
    2496                 :             case ESC_s:
    2497               0 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
    2498               0 :             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
    2499               0 :             continue;
    2500                 : 
    2501                 :             case ESC_S:
    2502               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
    2503               0 :             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
    2504               0 :             continue;
    2505                 : 
    2506                 :             case ESC_E: /* Perl ignores an orphan \E */
    2507               0 :             continue;
    2508                 : 
    2509                 :             default:    /* Not recognized; fall through */
    2510                 :             break;      /* Need "default" setting to stop compiler warning. */
    2511                 :             }
    2512                 : 
    2513                 :           /* In the pre-compile phase, just do the recognition. */
    2514                 : 
    2515               0 :           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
    2516                 :                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
    2517                 : 
    2518                 :           /* We need to deal with \P and \p in both phases. */
    2519                 : 
    2520                 : #ifdef SUPPORT_UCP
    2521               0 :           if (-c == ESC_p || -c == ESC_P)
    2522                 :             {
    2523                 :             BOOL negated;
    2524                 :             int pdata;
    2525               0 :             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
    2526               0 :             if (ptype < 0) goto FAILED;
    2527               0 :             class_utf8 = TRUE;
    2528               0 :             *class_utf8data++ = ((-c == ESC_p) != negated)?
    2529                 :               XCL_PROP : XCL_NOTPROP;
    2530               0 :             *class_utf8data++ = ptype;
    2531               0 :             *class_utf8data++ = pdata;
    2532               0 :             class_charcount -= 2;   /* Not a < 256 character */
    2533               0 :             continue;
    2534                 :             }
    2535                 : #endif
    2536                 :           /* Unrecognized escapes are faulted if PCRE is running in its
    2537                 :           strict mode. By default, for compatibility with Perl, they are
    2538                 :           treated as literals. */
    2539                 : 
    2540               0 :           if ((options & PCRE_EXTRA) != 0)
    2541                 :             {
    2542               0 :             *errorcodeptr = ERR7;
    2543               0 :             goto FAILED;
    2544                 :             }
    2545                 : 
    2546               0 :           class_charcount -= 2;  /* Undo the default count from above */
    2547               0 :           c = *ptr;              /* Get the final character and fall through */
    2548                 :           }
    2549                 : 
    2550                 :         /* Fall through if we have a single character (c >= 0). This may be
    2551                 :         greater than 256 in UTF-8 mode. */
    2552                 : 
    2553                 :         }   /* End of backslash handling */
    2554                 : 
    2555                 :       /* A single character may be followed by '-' to form a range. However,
    2556                 :       Perl does not permit ']' to be the end of the range. A '-' character
    2557                 :       at the end is treated as a literal. Perl ignores orphaned \E sequences
    2558                 :       entirely. The code for handling \Q and \E is messy. */
    2559                 : 
    2560             672 :       CHECK_RANGE:
    2561            1344 :       while (ptr[1] == '\\' && ptr[2] == 'E')
    2562                 :         {
    2563               0 :         inescq = FALSE;
    2564               0 :         ptr += 2;
    2565                 :         }
    2566                 : 
    2567             672 :       oldptr = ptr;
    2568                 : 
    2569             672 :       if (!inescq && ptr[1] == '-')
    2570                 :         {
    2571                 :         int d;
    2572             598 :         ptr += 2;
    2573             598 :         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
    2574                 : 
    2575                 :         /* If we hit \Q (not followed by \E) at this point, go into escaped
    2576                 :         mode. */
    2577                 : 
    2578            1196 :         while (*ptr == '\\' && ptr[1] == 'Q')
    2579                 :           {
    2580               0 :           ptr += 2;
    2581               0 :           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
    2582               0 :           inescq = TRUE;
    2583               0 :           break;
    2584                 :           }
    2585                 : 
    2586             598 :         if (*ptr == 0 || (!inescq && *ptr == ']'))
    2587                 :           {
    2588               0 :           ptr = oldptr;
    2589               0 :           goto LONE_SINGLE_CHARACTER;
    2590                 :           }
    2591                 : 
    2592                 : #ifdef SUPPORT_UTF8
    2593             598 :         if (utf8)
    2594                 :           {                           /* Braces are required because the */
    2595               0 :           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
    2596                 :           }
    2597                 :         else
    2598                 : #endif
    2599             598 :         d = *ptr;  /* Not UTF-8 mode */
    2600                 : 
    2601                 :         /* The second part of a range can be a single-character escape, but
    2602                 :         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
    2603                 :         in such circumstances. */
    2604                 : 
    2605             598 :         if (!inescq && d == '\\')
    2606                 :           {
    2607               0 :           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
    2608               0 :           if (*errorcodeptr != 0) goto FAILED;
    2609                 : 
    2610                 :           /* \b is backspace; \X is literal X; \R is literal R; any other
    2611                 :           special means the '-' was literal */
    2612                 : 
    2613               0 :           if (d < 0)
    2614                 :             {
    2615               0 :             if (d == -ESC_b) d = '\b';
    2616               0 :             else if (d == -ESC_X) d = 'X';
    2617               0 :             else if (d == -ESC_R) d = 'R'; else
    2618                 :               {
    2619               0 :               ptr = oldptr;
    2620               0 :               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    2621                 :               }
    2622                 :             }
    2623                 :           }
    2624                 : 
    2625                 :         /* Check that the two values are in the correct order. Optimize
    2626                 :         one-character ranges */
    2627                 : 
    2628             598 :         if (d < c)
    2629                 :           {
    2630               0 :           *errorcodeptr = ERR8;
    2631               0 :           goto FAILED;
    2632                 :           }
    2633                 : 
    2634             598 :         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    2635                 : 
    2636                 :         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
    2637                 :         matching, we have to use an XCLASS with extra data items. Caseless
    2638                 :         matching for characters > 127 is available only if UCP support is
    2639                 :         available. */
    2640                 : 
    2641                 : #ifdef SUPPORT_UTF8
    2642             598 :         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
    2643                 :           {
    2644               0 :           class_utf8 = TRUE;
    2645                 : 
    2646                 :           /* With UCP support, we can find the other case equivalents of
    2647                 :           the relevant characters. There may be several ranges. Optimize how
    2648                 :           they fit with the basic range. */
    2649                 : 
    2650                 : #ifdef SUPPORT_UCP
    2651               0 :           if ((options & PCRE_CASELESS) != 0)
    2652                 :             {
    2653                 :             unsigned int occ, ocd;
    2654               0 :             unsigned int cc = c;
    2655               0 :             unsigned int origd = d;
    2656               0 :             while (get_othercase_range(&cc, origd, &occ, &ocd))
    2657                 :               {
    2658               0 :               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
    2659                 : 
    2660               0 :               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
    2661                 :                 {                                  /* if there is overlap,   */
    2662               0 :                 c = occ;                           /* noting that if occ < c */
    2663               0 :                 continue;                          /* we can't have ocd > d  */
    2664                 :                 }                                  /* because a subrange is  */
    2665               0 :               if (ocd > d && occ <= d + 1)         /* always shorter than    */
    2666                 :                 {                                  /* the basic range.       */
    2667               0 :                 d = ocd;
    2668               0 :                 continue;
    2669                 :                 }
    2670                 : 
    2671               0 :               if (occ == ocd)
    2672                 :                 {
    2673               0 :                 *class_utf8data++ = XCL_SINGLE;
    2674                 :                 }
    2675                 :               else
    2676                 :                 {
    2677               0 :                 *class_utf8data++ = XCL_RANGE;
    2678               0 :                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
    2679                 :                 }
    2680               0 :               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
    2681                 :               }
    2682                 :             }
    2683                 : #endif  /* SUPPORT_UCP */
    2684                 : 
    2685                 :           /* Now record the original range, possibly modified for UCP caseless
    2686                 :           overlapping ranges. */
    2687                 : 
    2688               0 :           *class_utf8data++ = XCL_RANGE;
    2689               0 :           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
    2690               0 :           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
    2691                 : 
    2692                 :           /* With UCP support, we are done. Without UCP support, there is no
    2693                 :           caseless matching for UTF-8 characters > 127; we can use the bit map
    2694                 :           for the smaller ones. */
    2695                 : 
    2696                 : #ifdef SUPPORT_UCP
    2697               0 :           continue;    /* With next character in the class */
    2698                 : #else
    2699                 :           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
    2700                 : 
    2701                 :           /* Adjust upper limit and fall through to set up the map */
    2702                 : 
    2703                 :           d = 127;
    2704                 : 
    2705                 : #endif  /* SUPPORT_UCP */
    2706                 :           }
    2707                 : #endif  /* SUPPORT_UTF8 */
    2708                 : 
    2709                 :         /* We use the bit map for all cases when not in UTF-8 mode; else
    2710                 :         ranges that lie entirely within 0-127 when there is UCP support; else
    2711                 :         for partial ranges without UCP support. */
    2712                 : 
    2713             598 :         class_charcount += d - c + 1;
    2714             598 :         class_lastchar = d;
    2715                 : 
    2716                 :         /* We can save a bit of time by skipping this in the pre-compile. */
    2717                 : 
    2718            3604 :         if (lengthptr == NULL) for (; c <= d; c++)
    2719                 :           {
    2720            3006 :           classbits[c/8] |= (1 << (c&7));
    2721            3006 :           if ((options & PCRE_CASELESS) != 0)
    2722                 :             {
    2723               0 :             int uc = cd->fcc[c];           /* flip case */
    2724               0 :             classbits[uc/8] |= (1 << (uc&7));
    2725                 :             }
    2726                 :           }
    2727                 : 
    2728             598 :         continue;   /* Go get the next char in the class */
    2729                 :         }
    2730                 : 
    2731                 :       /* Handle a lone single character - we can get here for a normal
    2732                 :       non-escape char, or after \ that introduces a single character or for an
    2733                 :       apparent range that isn't. */
    2734                 : 
    2735              74 :       LONE_SINGLE_CHARACTER:
    2736                 : 
    2737                 :       /* Handle a character that cannot go in the bit map */
    2738                 : 
    2739                 : #ifdef SUPPORT_UTF8
    2740              74 :       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
    2741                 :         {
    2742               0 :         class_utf8 = TRUE;
    2743               0 :         *class_utf8data++ = XCL_SINGLE;
    2744               0 :         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
    2745                 : 
    2746                 : #ifdef SUPPORT_UCP
    2747               0 :         if ((options & PCRE_CASELESS) != 0)
    2748                 :           {
    2749                 :           unsigned int othercase;
    2750               0 :           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
    2751                 :             {
    2752               0 :             *class_utf8data++ = XCL_SINGLE;
    2753               0 :             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
    2754                 :             }
    2755                 :           }
    2756                 : #endif  /* SUPPORT_UCP */
    2757                 : 
    2758                 :         }
    2759                 :       else
    2760                 : #endif  /* SUPPORT_UTF8 */
    2761                 : 
    2762                 :       /* Handle a single-byte character */
    2763                 :         {
    2764              74 :         classbits[c/8] |= (1 << (c&7));
    2765              74 :         if ((options & PCRE_CASELESS) != 0)
    2766                 :           {
    2767               0 :           c = cd->fcc[c];   /* flip case */
    2768               0 :           classbits[c/8] |= (1 << (c&7));
    2769                 :           }
    2770              74 :         class_charcount++;
    2771              74 :         class_lastchar = c;
    2772                 :         }
    2773                 :       }
    2774                 : 
    2775                 :     /* Loop until ']' reached. This "while" is the end of the "do" above. */
    2776                 : 
    2777             672 :     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
    2778                 : 
    2779             634 :     if (c == 0)                          /* Missing terminating ']' */
    2780                 :       {
    2781               0 :       *errorcodeptr = ERR6;
    2782               0 :       goto FAILED;
    2783                 :       }
    2784                 : 
    2785                 :     /* If class_charcount is 1, we saw precisely one character whose value is
    2786                 :     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
    2787                 :     can optimize the negative case only if there were no characters >= 128
    2788                 :     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
    2789                 :     single-bytes only. This is an historical hangover. Maybe one day we can
    2790                 :     tidy these opcodes to handle multi-byte characters.
    2791                 : 
    2792                 :     The optimization throws away the bit map. We turn the item into a
    2793                 :     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
    2794                 :     that OP_NOT does not support multibyte characters. In the positive case, it
    2795                 :     can cause firstbyte to be set. Otherwise, there can be no first char if
    2796                 :     this item is first, whatever repeat count may follow. In the case of
    2797                 :     reqbyte, save the previous value for reinstating. */
    2798                 : 
    2799                 : #ifdef SUPPORT_UTF8
    2800             634 :     if (class_charcount == 1 &&
    2801                 :           (!utf8 ||
    2802                 :           (!class_utf8 && (!negate_class || class_lastchar < 128))))
    2803                 : 
    2804                 : #else
    2805                 :     if (class_charcount == 1)
    2806                 : #endif
    2807                 :       {
    2808               0 :       zeroreqbyte = reqbyte;
    2809                 : 
    2810                 :       /* The OP_NOT opcode works on one-byte characters only. */
    2811                 : 
    2812               0 :       if (negate_class)
    2813                 :         {
    2814               0 :         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2815               0 :         zerofirstbyte = firstbyte;
    2816               0 :         *code++ = OP_NOT;
    2817               0 :         *code++ = class_lastchar;
    2818               0 :         break;
    2819                 :         }
    2820                 : 
    2821                 :       /* For a single, positive character, get the value into mcbuffer, and
    2822                 :       then we can handle this with the normal one-character code. */
    2823                 : 
    2824                 : #ifdef SUPPORT_UTF8
    2825               0 :       if (utf8 && class_lastchar > 127)
    2826               0 :         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
    2827                 :       else
    2828                 : #endif
    2829                 :         {
    2830               0 :         mcbuffer[0] = class_lastchar;
    2831               0 :         mclength = 1;
    2832                 :         }
    2833               0 :       goto ONE_CHAR;
    2834                 :       }       /* End of 1-char optimization */
    2835                 : 
    2836                 :     /* The general case - not the one-char optimization. If this is the first
    2837                 :     thing in the branch, there can be no first char setting, whatever the
    2838                 :     repeat count. Any reqbyte setting must remain unchanged after any kind of
    2839                 :     repeat. */
    2840                 : 
    2841             634 :     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2842             634 :     zerofirstbyte = firstbyte;
    2843             634 :     zeroreqbyte = reqbyte;
    2844                 : 
    2845                 :     /* If there are characters with values > 255, we have to compile an
    2846                 :     extended class, with its own opcode. If there are no characters < 256,
    2847                 :     we can omit the bitmap in the actual compiled code. */
    2848                 : 
    2849                 : #ifdef SUPPORT_UTF8
    2850             634 :     if (class_utf8)
    2851                 :       {
    2852               0 :       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
    2853               0 :       *code++ = OP_XCLASS;
    2854               0 :       code += LINK_SIZE;
    2855               0 :       *code = negate_class? XCL_NOT : 0;
    2856                 : 
    2857                 :       /* If the map is required, move up the extra data to make room for it;
    2858                 :       otherwise just move the code pointer to the end of the extra data. */
    2859                 : 
    2860               0 :       if (class_charcount > 0)
    2861                 :         {
    2862               0 :         *code++ |= XCL_MAP;
    2863               0 :         memmove(code + 32, code, class_utf8data - code);
    2864               0 :         memcpy(code, classbits, 32);
    2865               0 :         code = class_utf8data + 32;
    2866                 :         }
    2867               0 :       else code = class_utf8data;
    2868                 : 
    2869                 :       /* Now fill in the complete length of the item */
    2870                 : 
    2871               0 :       PUT(previous, 1, code - previous);
    2872               0 :       break;   /* End of class handling */
    2873                 :       }
    2874                 : #endif
    2875                 : 
    2876                 :     /* If there are no characters > 255, negate the 32-byte map if necessary,
    2877                 :     and copy it into the code vector. If this is the first thing in the branch,
    2878                 :     there can be no first char setting, whatever the repeat count. Any reqbyte
    2879                 :     setting must remain unchanged after any kind of repeat. */
    2880                 : 
    2881             634 :     if (negate_class)
    2882                 :       {
    2883               0 :       *code++ = OP_NCLASS;
    2884               0 :       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
    2885               0 :         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
    2886                 :       }
    2887                 :     else
    2888                 :       {
    2889             634 :       *code++ = OP_CLASS;
    2890             634 :       memcpy(code, classbits, 32);
    2891                 :       }
    2892             634 :     code += 32;
    2893             634 :     break;
    2894                 : 
    2895                 : 
    2896                 :     /* ===================================================================*/
    2897                 :     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
    2898                 :     has been tested above. */
    2899                 : 
    2900                 :     case '{':
    2901               0 :     if (!is_quantifier) goto NORMAL_CHAR;
    2902               0 :     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
    2903               0 :     if (*errorcodeptr != 0) goto FAILED;
    2904               0 :     goto REPEAT;
    2905                 : 
    2906                 :     case '*':
    2907              46 :     repeat_min = 0;
    2908              46 :     repeat_max = -1;
    2909              46 :     goto REPEAT;
    2910                 : 
    2911                 :     case '+':
    2912            1048 :     repeat_min = 1;
    2913            1048 :     repeat_max = -1;
    2914            1048 :     goto REPEAT;
    2915                 : 
    2916                 :     case '?':
    2917             170 :     repeat_min = 0;
    2918             170 :     repeat_max = 1;
    2919                 : 
    2920            1264 :     REPEAT:
    2921            1264 :     if (previous == NULL)
    2922                 :       {
    2923               0 :       *errorcodeptr = ERR9;
    2924               0 :       goto FAILED;
    2925                 :       }
    2926                 : 
    2927            1264 :     if (repeat_min == 0)
    2928                 :       {
    2929             216 :       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
    2930             216 :       reqbyte = zeroreqbyte;        /* Ditto */
    2931                 :       }
    2932                 : 
    2933                 :     /* Remember whether this is a variable length repeat */
    2934                 : 
    2935            1264 :     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
    2936                 : 
    2937            1264 :     op_type = 0;                    /* Default single-char op codes */
    2938            1264 :     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
    2939                 : 
    2940                 :     /* Save start of previous item, in case we have to move it up to make space
    2941                 :     for an inserted OP_ONCE for the additional '+' extension. */
    2942                 : 
    2943            1264 :     tempcode = previous;
    2944                 : 
    2945                 :     /* If the next character is '+', we have a possessive quantifier. This
    2946                 :     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
    2947                 :     If the next character is '?' this is a minimizing repeat, by default,
    2948                 :     but if PCRE_UNGREEDY is set, it works the other way round. We change the
    2949                 :     repeat type to the non-default. */
    2950                 : 
    2951            1264 :     if (ptr[1] == '+')
    2952                 :       {
    2953               0 :       repeat_type = 0;                  /* Force greedy */
    2954               0 :       possessive_quantifier = TRUE;
    2955               0 :       ptr++;
    2956                 :       }
    2957            1264 :     else if (ptr[1] == '?')
    2958                 :       {
    2959             478 :       repeat_type = greedy_non_default;
    2960             478 :       ptr++;
    2961                 :       }
    2962             786 :     else repeat_type = greedy_default;
    2963                 : 
    2964                 :     /* If previous was a character match, abolish the item and generate a
    2965                 :     repeat item instead. If a char item has a minimum of more than one, ensure
    2966                 :     that it is set in reqbyte - it might not be if a sequence such as x{3} is
    2967                 :     the first thing in a branch because the x will have gone into firstbyte
    2968                 :     instead.  */
    2969                 : 
    2970            1264 :     if (*previous == OP_CHAR || *previous == OP_CHARNC)
    2971                 :       {
    2972                 :       /* Deal with UTF-8 characters that take up more than one byte. It's
    2973                 :       easier to write this out separately than try to macrify it. Use c to
    2974                 :       hold the length of the character in bytes, plus 0x80 to flag that it's a
    2975                 :       length rather than a small character. */
    2976                 : 
    2977                 : #ifdef SUPPORT_UTF8
    2978             108 :       if (utf8 && (code[-1] & 0x80) != 0)
    2979                 :         {
    2980               0 :         uschar *lastchar = code - 1;
    2981               0 :         while((*lastchar & 0xc0) == 0x80) lastchar--;
    2982               0 :         c = code - lastchar;            /* Length of UTF-8 character */
    2983               0 :         memcpy(utf8_char, lastchar, c); /* Save the char */
    2984               0 :         c |= 0x80;                      /* Flag c as a length */
    2985                 :         }
    2986                 :       else
    2987                 : #endif
    2988                 : 
    2989                 :       /* Handle the case of a single byte - either with no UTF8 support, or
    2990                 :       with UTF-8 disabled, or for a UTF-8 character < 128. */
    2991                 : 
    2992                 :         {
    2993             108 :         c = code[-1];
    2994             108 :         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
    2995                 :         }
    2996                 : 
    2997                 :       /* If the repetition is unlimited, it pays to see if the next thing on
    2998                 :       the line is something that cannot possibly match this character. If so,
    2999                 :       automatically possessifying this item gains some performance in the case
    3000                 :       where the match fails. */
    3001                 : 
    3002             108 :       if (!possessive_quantifier &&
    3003                 :           repeat_max < 0 &&
    3004                 :           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
    3005                 :             options, cd))
    3006                 :         {
    3007               6 :         repeat_type = 0;    /* Force greedy */
    3008               6 :         possessive_quantifier = TRUE;
    3009                 :         }
    3010                 : 
    3011             108 :       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
    3012                 :       }
    3013                 : 
    3014                 :     /* If previous was a single negated character ([^a] or similar), we use
    3015                 :     one of the special opcodes, replacing it. The code is shared with single-
    3016                 :     character repeats by setting op_type to add a suitable offset into
    3017                 :     repeat_type. We can also test for auto-possessification. OP_NOT is
    3018                 :     currently used only for single-byte chars. */
    3019                 : 
    3020            1156 :     else if (*previous == OP_NOT)
    3021                 :       {
    3022               0 :       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
    3023               0 :       c = previous[1];
    3024               0 :       if (!possessive_quantifier &&
    3025                 :           repeat_max < 0 &&
    3026                 :           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
    3027                 :         {
    3028               0 :         repeat_type = 0;    /* Force greedy */
    3029               0 :         possessive_quantifier = TRUE;
    3030                 :         }
    3031               0 :       goto OUTPUT_SINGLE_REPEAT;
    3032                 :       }
    3033                 : 
    3034                 :     /* If previous was a character type match (\d or similar), abolish it and
    3035                 :     create a suitable repeat item. The code is shared with single-character
    3036                 :     repeats by setting op_type to add a suitable offset into repeat_type. Note
    3037                 :     that the Unicode property types will be present only when SUPPORT_UCP is
    3038                 :     defined, but we don't wrap the little bits of code here because it just
    3039                 :     makes it horribly messy. */
    3040                 : 
    3041            1156 :     else if (*previous < OP_EODN)
    3042                 :       {
    3043                 :       uschar *oldcode;
    3044                 :       int prop_type, prop_value;
    3045             488 :       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
    3046             488 :       c = *previous;
    3047                 : 
    3048             488 :       if (!possessive_quantifier &&
    3049                 :           repeat_max < 0 &&
    3050                 :           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
    3051                 :         {
    3052               2 :         repeat_type = 0;    /* Force greedy */
    3053               2 :         possessive_quantifier = TRUE;
    3054                 :         }
    3055                 : 
    3056             596 :       OUTPUT_SINGLE_REPEAT:
    3057             596 :       if (*previous == OP_PROP || *previous == OP_NOTPROP)
    3058                 :         {
    3059               0 :         prop_type = previous[1];
    3060               0 :         prop_value = previous[2];
    3061                 :         }
    3062             596 :       else prop_type = prop_value = -1;
    3063                 : 
    3064             596 :       oldcode = code;
    3065             596 :       code = previous;                  /* Usually overwrite previous item */
    3066                 : 
    3067                 :       /* If the maximum is zero then the minimum must also be zero; Perl allows
    3068                 :       this case, so we do too - by simply omitting the item altogether. */
    3069                 : 
    3070             596 :       if (repeat_max == 0) goto END_REPEAT;
    3071                 : 
    3072                 :       /* All real repeats make it impossible to handle partial matching (maybe
    3073                 :       one day we will be able to remove this restriction). */
    3074                 : 
    3075             596 :       if (repeat_max != 1) cd->nopartial = TRUE;
    3076                 : 
    3077                 :       /* Combine the op_type with the repeat_type */
    3078                 : 
    3079             596 :       repeat_type += op_type;
    3080                 : 
    3081                 :       /* A minimum of zero is handled either as the special case * or ?, or as
    3082                 :       an UPTO, with the maximum given. */
    3083                 : 
    3084             596 :       if (repeat_min == 0)
    3085                 :         {
    3086             114 :         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
    3087             102 :           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
    3088                 :         else
    3089                 :           {
    3090               0 :           *code++ = OP_UPTO + repeat_type;
    3091               0 :           PUT2INC(code, 0, repeat_max);
    3092                 :           }
    3093                 :         }
    3094                 : 
    3095                 :       /* A repeat minimum of 1 is optimized into some special cases. If the
    3096                 :       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
    3097                 :       left in place and, if the maximum is greater than 1, we use OP_UPTO with
    3098                 :       one less than the maximum. */
    3099                 : 
    3100             482 :       else if (repeat_min == 1)
    3101                 :         {
    3102             482 :         if (repeat_max == -1)
    3103             482 :           *code++ = OP_PLUS + repeat_type;
    3104                 :         else
    3105                 :           {
    3106               0 :           code = oldcode;                 /* leave previous item in place */
    3107               0 :           if (repeat_max == 1) goto END_REPEAT;
    3108               0 :           *code++ = OP_UPTO + repeat_type;
    3109               0 :           PUT2INC(code, 0, repeat_max - 1);
    3110                 :           }
    3111                 :         }
    3112                 : 
    3113                 :       /* The case {n,n} is just an EXACT, while the general case {n,m} is
    3114                 :       handled as an EXACT followed by an UPTO. */
    3115                 : 
    3116                 :       else
    3117                 :         {
    3118               0 :         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
    3119               0 :         PUT2INC(code, 0, repeat_min);
    3120                 : 
    3121                 :         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
    3122                 :         we have to insert the character for the previous code. For a repeated
    3123                 :         Unicode property match, there are two extra bytes that define the
    3124                 :         required property. In UTF-8 mode, long characters have their length in
    3125                 :         c, with the 0x80 bit as a flag. */
    3126                 : 
    3127               0 :         if (repeat_max < 0)
    3128                 :           {
    3129                 : #ifdef SUPPORT_UTF8
    3130               0 :           if (utf8 && c >= 128)
    3131                 :             {
    3132               0 :             memcpy(code, utf8_char, c & 7);
    3133               0 :             code += c & 7;
    3134                 :             }
    3135                 :           else
    3136                 : #endif
    3137                 :             {
    3138               0 :             *code++ = c;
    3139               0 :             if (prop_type >= 0)
    3140                 :               {
    3141               0 :               *code++ = prop_type;
    3142               0 :               *code++ = prop_value;
    3143                 :               }
    3144                 :             }
    3145               0 :           *code++ = OP_STAR + repeat_type;
    3146                 :           }
    3147                 : 
    3148                 :         /* Else insert an UPTO if the max is greater than the min, again
    3149                 :         preceded by the character, for the previously inserted code. If the
    3150                 :         UPTO is just for 1 instance, we can use QUERY instead. */
    3151                 : 
    3152               0 :         else if (repeat_max != repeat_min)
    3153                 :           {
    3154                 : #ifdef SUPPORT_UTF8
    3155               0 :           if (utf8 && c >= 128)
    3156                 :             {
    3157               0 :             memcpy(code, utf8_char, c & 7);
    3158               0 :             code += c & 7;
    3159                 :             }
    3160                 :           else
    3161                 : #endif
    3162               0 :           *code++ = c;
    3163               0 :           if (prop_type >= 0)
    3164                 :             {
    3165               0 :             *code++ = prop_type;
    3166               0 :             *code++ = prop_value;
    3167                 :             }
    3168               0 :           repeat_max -= repeat_min;
    3169                 : 
    3170               0 :           if (repeat_max == 1)
    3171                 :             {
    3172               0 :             *code++ = OP_QUERY + repeat_type;
    3173                 :             }
    3174                 :           else
    3175                 :             {
    3176               0 :             *code++ = OP_UPTO + repeat_type;
    3177               0 :             PUT2INC(code, 0, repeat_max);
    3178                 :             }
    3179                 :           }
    3180                 :         }
    3181                 : 
    3182                 :       /* The character or character type itself comes last in all cases. */
    3183                 : 
    3184                 : #ifdef SUPPORT_UTF8
    3185             596 :       if (utf8 && c >= 128)
    3186                 :         {
    3187               0 :         memcpy(code, utf8_char, c & 7);
    3188               0 :         code += c & 7;
    3189                 :         }
    3190                 :       else
    3191                 : #endif
    3192             596 :       *code++ = c;
    3193                 : 
    3194                 :       /* For a repeated Unicode property match, there are two extra bytes that
    3195                 :       define the required property. */
    3196                 : 
    3197                 : #ifdef SUPPORT_UCP
    3198             596 :       if (prop_type >= 0)
    3199                 :         {
    3200               0 :         *code++ = prop_type;
    3201               0 :         *code++ = prop_value;
    3202                 :         }
    3203                 : #endif
    3204                 :       }
    3205                 : 
    3206                 :     /* If previous was a character class or a back reference, we put the repeat
    3207                 :     stuff after it, but just skip the item if the repeat was {0,0}. */
    3208                 : 
    3209            1302 :     else if (*previous == OP_CLASS ||
    3210                 :              *previous == OP_NCLASS ||
    3211                 : #ifdef SUPPORT_UTF8
    3212                 :              *previous == OP_XCLASS ||
    3213                 : #endif
    3214                 :              *previous == OP_REF)
    3215                 :       {
    3216             634 :       if (repeat_max == 0)
    3217                 :         {
    3218               0 :         code = previous;
    3219               0 :         goto END_REPEAT;
    3220                 :         }
    3221                 : 
    3222                 :       /* All real repeats make it impossible to handle partial matching (maybe
    3223                 :       one day we will be able to remove this restriction). */
    3224                 : 
    3225             634 :       if (repeat_max != 1) cd->nopartial = TRUE;
    3226                 : 
    3227             668 :       if (repeat_min == 0 && repeat_max == -1)
    3228              34 :         *code++ = OP_CRSTAR + repeat_type;
    3229            1166 :       else if (repeat_min == 1 && repeat_max == -1)
    3230             566 :         *code++ = OP_CRPLUS + repeat_type;
    3231              68 :       else if (repeat_min == 0 && repeat_max == 1)
    3232              34 :         *code++ = OP_CRQUERY + repeat_type;
    3233                 :       else
    3234                 :         {
    3235               0 :         *code++ = OP_CRRANGE + repeat_type;
    3236               0 :         PUT2INC(code, 0, repeat_min);
    3237               0 :         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
    3238               0 :         PUT2INC(code, 0, repeat_max);
    3239                 :         }
    3240                 :       }
    3241                 : 
    3242                 :     /* If previous was a bracket group, we may have to replicate it in certain
    3243                 :     cases. */
    3244                 : 
    3245              68 :     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
    3246                 :              *previous == OP_ONCE || *previous == OP_COND)
    3247                 :       {
    3248                 :       register int i;
    3249              34 :       int ketoffset = 0;
    3250              34 :       int len = code - previous;
    3251              34 :       uschar *bralink = NULL;
    3252                 : 
    3253                 :       /* Repeating a DEFINE group is pointless */
    3254                 : 
    3255              34 :       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
    3256                 :         {
    3257               0 :         *errorcodeptr = ERR55;
    3258               0 :         goto FAILED;
    3259                 :         }
    3260                 : 
    3261                 :       /* This is a paranoid check to stop integer overflow later on */
    3262                 : 
    3263              34 :       if (len > MAX_DUPLENGTH)
    3264                 :         {
    3265               0 :         *errorcodeptr = ERR50;
    3266               0 :         goto FAILED;
    3267                 :         }
    3268                 : 
    3269                 :       /* If the maximum repeat count is unlimited, find the end of the bracket
    3270                 :       by scanning through from the start, and compute the offset back to it
    3271                 :       from the current code pointer. There may be an OP_OPT setting following
    3272                 :       the final KET, so we can't find the end just by going back from the code
    3273                 :       pointer. */
    3274                 : 
    3275              34 :       if (repeat_max == -1)
    3276                 :         {
    3277               0 :         register uschar *ket = previous;
    3278               0 :         do ket += GET(ket, 1); while (*ket != OP_KET);
    3279               0 :         ketoffset = code - ket;
    3280                 :         }
    3281                 : 
    3282                 :       /* The case of a zero minimum is special because of the need to stick
    3283                 :       OP_BRAZERO in front of it, and because the group appears once in the
    3284                 :       data, whereas in other cases it appears the minimum number of times. For
    3285                 :       this reason, it is simplest to treat this case separately, as otherwise
    3286                 :       the code gets far too messy. There are several special subcases when the
    3287                 :       minimum is zero. */
    3288                 : 
    3289              34 :       if (repeat_min == 0)
    3290                 :         {
    3291                 :         /* If the maximum is also zero, we just omit the group from the output
    3292                 :         altogether. */
    3293                 : 
    3294              34 :         if (repeat_max == 0)
    3295                 :           {
    3296               0 :           code = previous;
    3297               0 :           goto END_REPEAT;
    3298                 :           }
    3299                 : 
    3300                 :         /* If the maximum is 1 or unlimited, we just have to stick in the
    3301                 :         BRAZERO and do no more at this point. However, we do need to adjust
    3302                 :         any OP_RECURSE calls inside the group that refer to the group itself or
    3303                 :         any internal or forward referenced group, because the offset is from
    3304                 :         the start of the whole regex. Temporarily terminate the pattern while
    3305                 :         doing this. */
    3306                 : 
    3307              34 :         if (repeat_max <= 1)
    3308                 :           {
    3309              34 :           *code = OP_END;
    3310              34 :           adjust_recurse(previous, 1, utf8, cd, save_hwm);
    3311              34 :           memmove(previous+1, previous, len);
    3312              34 :           code++;
    3313              34 :           *previous++ = OP_BRAZERO + repeat_type;
    3314                 :           }
    3315                 : 
    3316                 :         /* If the maximum is greater than 1 and limited, we have to replicate
    3317                 :         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
    3318                 :         The first one has to be handled carefully because it's the original
    3319                 :         copy, which has to be moved up. The remainder can be handled by code
    3320                 :         that is common with the non-zero minimum case below. We have to
    3321                 :       adjust the value of repeat_max, since one less copy is required. Once
    3322                 :         again, we may have to adjust any OP_RECURSE calls inside the group. */
    3323                 : 
    3324                 :         else
    3325                 :           {
    3326                 :           int offset;
    3327               0 :           *code = OP_END;
    3328               0 :           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
    3329               0 :           memmove(previous + 2 + LINK_SIZE, previous, len);
    3330               0 :           code += 2 + LINK_SIZE;
    3331               0 :           *previous++ = OP_BRAZERO + repeat_type;
    3332               0 :           *previous++ = OP_BRA;
    3333                 : 
    3334                 :           /* We chain together the bracket offset fields that have to be
    3335                 :           filled in later when the ends of the brackets are reached. */
    3336                 : 
    3337               0 :           offset = (bralink == NULL)? 0 : previous - bralink;
    3338               0 :           bralink = previous;
    3339               0 :           PUTINC(previous, 0, offset);
    3340                 :           }
    3341                 : 
    3342              34 :         repeat_max--;
    3343                 :         }
    3344                 : 
    3345                 :       /* If the minimum is greater than zero, replicate the group as many
    3346                 :       times as necessary, and adjust the maximum to the number of subsequent
    3347                 :       copies that we need. If we set a first char from the group, and didn't
    3348                 :       set a required char, copy the latter from the former. If there are any
    3349                 :       forward reference subroutine calls in the group, there will be entries on
    3350                 :       the workspace list; replicate these with an appropriate increment. */
    3351                 : 
    3352                 :       else
    3353                 :         {
    3354               0 :         if (repeat_min > 1)
    3355                 :           {
    3356                 :           /* In the pre-compile phase, we don't actually do the replication. We
    3357                 :           just adjust the length as if we had. */
    3358                 : 
    3359               0 :           if (lengthptr != NULL)
    3360               0 :             *lengthptr += (repeat_min - 1)*length_prevgroup;
    3361                 : 
    3362                 :           /* This is compiling for real */
    3363                 : 
    3364                 :           else
    3365                 :             {
    3366               0 :             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
    3367               0 :             for (i = 1; i < repeat_min; i++)
    3368                 :               {
    3369                 :               uschar *hc;
    3370               0 :               uschar *this_hwm = cd->hwm;
    3371               0 :               memcpy(code, previous, len);
    3372               0 :               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
    3373                 :                 {
    3374               0 :                 PUT(cd->hwm, 0, GET(hc, 0) + len);
    3375               0 :                 cd->hwm += LINK_SIZE;
    3376                 :                 }
    3377               0 :               save_hwm = this_hwm;
    3378               0 :               code += len;
    3379                 :               }
    3380                 :             }
    3381                 :           }
    3382                 : 
    3383               0 :         if (repeat_max > 0) repeat_max -= repeat_min;
    3384                 :         }
    3385                 : 
    3386                 :       /* This code is common to both the zero and non-zero minimum cases. If
    3387                 :       the maximum is limited, it replicates the group in a nested fashion,
    3388                 :       remembering the bracket starts on a stack. In the case of a zero minimum,
    3389                 :       the first one was set up above. In all cases the repeat_max now specifies
    3390                 :       the number of additional copies needed. Again, we must remember to
    3391                 :       replicate entries on the forward reference list. */
    3392                 : 
    3393              34 :       if (repeat_max >= 0)
    3394                 :         {
    3395                 :         /* In the pre-compile phase, we don't actually do the replication. We
    3396                 :         just adjust the length as if we had. For each repetition we must add 1
    3397                 :         to the length for BRAZERO and for all but the last repetition we must
    3398                 :         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
    3399                 : 
    3400              34 :         if (lengthptr != NULL && repeat_max > 0)
    3401               0 :           *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
    3402                 :             2 - 2*LINK_SIZE;  /* Last one doesn't nest */
    3403                 : 
    3404                 :         /* This is compiling for real */
    3405                 : 
    3406              34 :         else for (i = repeat_max - 1; i >= 0; i--)
    3407                 :           {
    3408                 :           uschar *hc;
    3409               0 :           uschar *this_hwm = cd->hwm;
    3410                 : 
    3411               0 :           *code++ = OP_BRAZERO + repeat_type;
    3412                 : 
    3413                 :           /* All but the final copy start a new nesting, maintaining the
    3414                 :           chain of brackets outstanding. */
    3415                 : 
    3416               0 :           if (i != 0)
    3417                 :             {
    3418                 :             int offset;
    3419               0 :             *code++ = OP_BRA;
    3420               0 :             offset = (bralink == NULL)? 0 : code - bralink;
    3421               0 :             bralink = code;
    3422               0 :             PUTINC(code, 0, offset);
    3423                 :             }
    3424                 : 
    3425               0 :           memcpy(code, previous, len);
    3426               0 :           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
    3427                 :             {
    3428               0 :             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
    3429               0 :             cd->hwm += LINK_SIZE;
    3430                 :             }
    3431               0 :           save_hwm = this_hwm;
    3432               0 :           code += len;
    3433                 :           }
    3434                 : 
    3435                 :         /* Now chain through the pending brackets, and fill in their length
    3436                 :         fields (which are holding the chain links pro tem). */
    3437                 : 
    3438              68 :         while (bralink != NULL)
    3439                 :           {
    3440                 :           int oldlinkoffset;
    3441               0 :           int offset = code - bralink + 1;
    3442               0 :           uschar *bra = code - offset;
    3443               0 :           oldlinkoffset = GET(bra, 1);
    3444               0 :           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
    3445               0 :           *code++ = OP_KET;
    3446               0 :           PUTINC(code, 0, offset);
    3447               0 :           PUT(bra, 1, offset);
    3448                 :           }
    3449                 :         }
    3450                 : 
    3451                 :       /* If the maximum is unlimited, set a repeater in the final copy. We
    3452                 :       can't just offset backwards from the current code point, because we
    3453                 :       don't know if there's been an options resetting after the ket. The
    3454                 :       correct offset was computed above.
    3455                 : 
    3456                 :       Then, when we are doing the actual compile phase, check to see whether
    3457                 :       this group is a non-atomic one that could match an empty string. If so,
    3458                 :       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
    3459                 :       that runtime checking can be done. [This check is also applied to
    3460                 :       atomic groups at runtime, but in a different way.] */
    3461                 : 
    3462                 :       else
    3463                 :         {
    3464               0 :         uschar *ketcode = code - ketoffset;
    3465               0 :         uschar *bracode = ketcode - GET(ketcode, 1);
    3466               0 :         *ketcode = OP_KETRMAX + repeat_type;
    3467               0 :         if (lengthptr == NULL && *bracode != OP_ONCE)
    3468                 :           {
    3469               0 :           uschar *scode = bracode;
    3470                 :           do
    3471                 :             {
    3472               0 :             if (could_be_empty_branch(scode, ketcode, utf8))
    3473                 :               {
    3474               0 :               *bracode += OP_SBRA - OP_BRA;
    3475               0 :               break;
    3476                 :               }
    3477               0 :             scode += GET(scode, 1);
    3478                 :             }
    3479               0 :           while (*scode == OP_ALT);
    3480                 :           }
    3481                 :         }
    3482                 :       }
    3483                 : 
    3484                 :     /* Else there's some kind of shambles */
    3485                 : 
    3486                 :     else
    3487                 :       {
    3488               0 :       *errorcodeptr = ERR11;
    3489               0 :       goto FAILED;
    3490                 :       }
    3491                 : 
    3492                 :     /* If the character following a repeat is '+', or if certain optimization
    3493                 :     tests above succeeded, possessive_quantifier is TRUE. For some of the
    3494                 :     simpler opcodes, there is a special alternative opcode for this. For
    3495                 :     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
    3496                 :     The '+' notation is just syntactic sugar, taken from Sun's Java package,
    3497                 :     but the special opcodes can optimize it a bit. The repeated item starts at
    3498                 :     tempcode, not at previous, which might be the first part of a string whose
    3499                 :     (former) last char we repeated.
    3500                 : 
    3501                 :     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
    3502                 :     an 'upto' may follow. We skip over an 'exact' item, and then test the
    3503                 :     length of what remains before proceeding. */
    3504                 : 
    3505            1264 :     if (possessive_quantifier)
    3506                 :       {
    3507                 :       int len;
    3508               8 :       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
    3509                 :           *tempcode == OP_NOTEXACT)
    3510               0 :         tempcode += _pcre_OP_lengths[*tempcode];
    3511               8 :       len = code - tempcode;
    3512               8 :       if (len > 0) switch (*tempcode)
    3513                 :         {
    3514               6 :         case OP_STAR:  *tempcode = OP_POSSTAR; break;
    3515               0 :         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
    3516               0 :         case OP_QUERY: *tempcode = OP_POSQUERY; break;
    3517               0 :         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
    3518                 : 
    3519               2 :         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
    3520               0 :         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
    3521               0 :         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
    3522               0 :         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
    3523                 : 
    3524               0 :         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
    3525               0 :         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
    3526               0 :         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
    3527               0 :         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
    3528                 : 
    3529                 :         default:
    3530               0 :         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
    3531               0 :         code += 1 + LINK_SIZE;
    3532               0 :         len += 1 + LINK_SIZE;
    3533               0 :         tempcode[0] = OP_ONCE;
    3534               0 :         *code++ = OP_KET;
    3535               0 :         PUTINC(code, 0, len);
    3536               0 :         PUT(tempcode, 1, len);
    3537                 :         break;
    3538                 :         }
    3539                 :       }
    3540                 : 
    3541                 :     /* In all cases we no longer have a previous item. We also set the
    3542                 :     "follows varying string" flag for subsequently encountered reqbytes if
    3543                 :     it isn't already set and we have just passed a varying length item. */
    3544                 : 
    3545            1264 :     END_REPEAT:
    3546            1264 :     previous = NULL;
    3547            1264 :     cd->req_varyopt |= reqvary;
    3548            1264 :     break;
    3549                 : 
    3550                 : 
    3551                 :     /* ===================================================================*/
    3552                 :     /* Start of nested parenthesized sub-expression, or comment or lookahead or
    3553                 :     lookbehind or option setting or condition or all the other extended
    3554                 :     parenthesis forms. First deal with the specials; all are introduced by ?,
    3555                 :     and the appearance of any of them means that this is not a capturing
    3556                 :     group. */
    3557                 : 
    3558                 :     case '(':
    3559              38 :     newoptions = options;
    3560              38 :     skipbytes = 0;
    3561              38 :     bravalue = OP_CBRA;
    3562              38 :     save_hwm = cd->hwm;
    3563                 : 
    3564              38 :     if (*(++ptr) == '?')
    3565                 :       {
    3566                 :       int i, set, unset, namelen;
    3567                 :       int *optset;
    3568                 :       const uschar *name;
    3569                 :       uschar *slot;
    3570                 : 
    3571               0 :       switch (*(++ptr))
    3572                 :         {
    3573                 :         case '#':                 /* Comment; skip to ket */
    3574               0 :         ptr++;
    3575               0 :         while (*ptr != 0 && *ptr != ')') ptr++;
    3576               0 :         if (*ptr == 0)
    3577                 :           {
    3578               0 :           *errorcodeptr = ERR18;
    3579               0 :           goto FAILED;
    3580                 :           }
    3581               0 :         continue;
    3582                 : 
    3583                 : 
    3584                 :         /* ------------------------------------------------------------ */
    3585                 :         case ':':                 /* Non-capturing bracket */
    3586               0 :         bravalue = OP_BRA;
    3587               0 :         ptr++;
    3588               0 :         break;
    3589                 : 
    3590                 : 
    3591                 :         /* ------------------------------------------------------------ */
    3592                 :         case '(':
    3593               0 :         bravalue = OP_COND;       /* Conditional group */
    3594                 : 
    3595                 :         /* A condition can be an assertion, a number (referring to a numbered
    3596                 :         group), a name (referring to a named group), or 'R', referring to
    3597                 :         recursion. R<digits> and R&name are also permitted for recursion tests.
    3598                 : 
    3599                 :         There are several syntaxes for testing a named group: (?(name)) is used
    3600                 :         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
    3601                 : 
    3602                 :         There are two unfortunate ambiguities, caused by history. (a) 'R' can
    3603                 :         be the recursive thing or the name 'R' (and similarly for 'R' followed
    3604                 :         by digits), and (b) a number could be a name that consists of digits.
    3605                 :         In both cases, we look for a name first; if not found, we try the other
    3606                 :         cases. */
    3607                 : 
    3608                 :         /* For conditions that are assertions, check the syntax, and then exit
    3609                 :         the switch. This will take control down to where bracketed groups,
    3610                 :         including assertions, are processed. */
    3611                 : 
    3612               0 :         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
    3613                 :           break;
    3614                 : 
    3615                 :         /* Most other conditions use OP_CREF (a couple change to OP_RREF
    3616                 :         below), and all need to skip 3 bytes at the start of the group. */
    3617                 : 
    3618               0 :         code[1+LINK_SIZE] = OP_CREF;
    3619               0 :         skipbytes = 3;
    3620                 : 
    3621                 :         /* Check for a test for recursion in a named group. */
    3622                 : 
    3623               0 :         if (ptr[1] == 'R' && ptr[2] == '&')
    3624                 :           {
    3625               0 :           terminator = -1;
    3626               0 :           ptr += 2;
    3627               0 :           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
    3628                 :           }
    3629                 : 
    3630                 :         /* Check for a test for a named group's having been set, using the Perl
    3631                 :         syntax (?(<name>) or (?('name') */
    3632                 : 
    3633               0 :         else if (ptr[1] == '<')
    3634                 :           {
    3635               0 :           terminator = '>';
    3636               0 :           ptr++;
    3637                 :           }
    3638               0 :         else if (ptr[1] == '\'')
    3639                 :           {
    3640               0 :           terminator = '\'';
    3641               0 :           ptr++;
    3642                 :           }
    3643               0 :         else terminator = 0;
    3644                 : 
    3645                 :         /* We now expect to read a name; anything else is an error */
    3646                 : 
    3647               0 :         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
    3648                 :           {
    3649               0 :           ptr += 1;  /* To get the right offset */
    3650               0 :           *errorcodeptr = ERR28;
    3651               0 :           goto FAILED;
    3652                 :           }
    3653                 : 
    3654                 :         /* Read the name, but also get it as a number if it's all digits */
    3655                 : 
    3656               0 :         recno = 0;
    3657               0 :         name = ++ptr;
    3658               0 :         while ((cd->ctypes[*ptr] & ctype_word) != 0)
    3659                 :           {
    3660               0 :           if (recno >= 0)
    3661               0 :             recno = ((digitab[*ptr] & ctype_digit) != 0)?
    3662                 :               recno * 10 + *ptr - '0' : -1;
    3663               0 :           ptr++;
    3664                 :           }
    3665               0 :         namelen = ptr - name;
    3666                 : 
    3667               0 :         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
    3668                 :           {
    3669               0 :           ptr--;      /* Error offset */
    3670               0 :           *errorcodeptr = ERR26;
    3671               0 :           goto FAILED;
    3672                 :           }
    3673                 : 
    3674                 :         /* Do no further checking in the pre-compile phase. */
    3675                 : 
    3676               0 :         if (lengthptr != NULL) break;
    3677                 : 
    3678                 :         /* In the real compile we do the work of looking for the actual
    3679                 :         reference. */
    3680                 : 
    3681               0 :         slot = cd->name_table;
    3682               0 :         for (i = 0; i < cd->names_found; i++)
    3683                 :           {
    3684               0 :           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
    3685               0 :           slot += cd->name_entry_size;
    3686                 :           }
    3687                 : 
    3688                 :         /* Found a previous named subpattern */
    3689                 : 
    3690               0 :         if (i < cd->names_found)
    3691                 :           {
    3692               0 :           recno = GET2(slot, 0);
    3693               0 :           PUT2(code, 2+LINK_SIZE, recno);
    3694                 :           }
    3695                 : 
    3696                 :         /* Search the pattern for a forward reference */
    3697                 : 
    3698               0 :         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
    3699                 :                         (options & PCRE_EXTENDED) != 0)) > 0)
    3700                 :           {
    3701               0 :           PUT2(code, 2+LINK_SIZE, i);
    3702                 :           }
    3703                 : 
    3704                 :         /* If terminator == 0 it means that the name followed directly after
    3705                 :         the opening parenthesis [e.g. (?(abc)...] and in this case there are
    3706                 :         some further alternatives to try. For the cases where terminator != 0
    3707                 :         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
    3708                 :         now checked all the possibilities, so give an error. */
    3709                 : 
    3710               0 :         else if (terminator != 0)
    3711                 :           {
    3712               0 :           *errorcodeptr = ERR15;
    3713               0 :           goto FAILED;
    3714                 :           }
    3715                 : 
    3716                 :         /* Check for (?(R) for recursion. Allow digits after R to specify a
    3717                 :         specific group number. */
    3718                 : 
    3719               0 :         else if (*name == 'R')
    3720                 :           {
    3721               0 :           recno = 0;
    3722               0 :           for (i = 1; i < namelen; i++)
    3723                 :             {
    3724               0 :             if ((digitab[name[i]] & ctype_digit) == 0)
    3725                 :               {
    3726               0 :               *errorcodeptr = ERR15;
    3727               0 :               goto FAILED;
    3728                 :               }
    3729               0 :             recno = recno * 10 + name[i] - '0';
    3730                 :             }
    3731               0 :           if (recno == 0) recno = RREF_ANY;
    3732               0 :           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
    3733               0 :           PUT2(code, 2+LINK_SIZE, recno);
    3734                 :           }
    3735                 : 
    3736                 :         /* Similarly, check for the (?(DEFINE) "condition", which is always
    3737                 :         false. */
    3738                 : 
    3739               0 :         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
    3740                 :           {
    3741               0 :           code[1+LINK_SIZE] = OP_DEF;
    3742               0 :           skipbytes = 1;
    3743                 :           }
    3744                 : 
    3745                 :         /* Check for the "name" actually being a subpattern number. */
    3746                 : 
    3747               0 :         else if (recno > 0)
    3748                 :           {
    3749               0 :           PUT2(code, 2+LINK_SIZE, recno);
    3750                 :           }
    3751                 : 
    3752                 :         /* Either an unidentified subpattern, or a reference to (?(0) */
    3753                 : 
    3754                 :         else
    3755                 :           {
    3756               0 :           *errorcodeptr = (recno == 0)? ERR35: ERR15;
    3757               0 :           goto FAILED;
    3758                 :           }
    3759               0 :         break;
    3760                 : 
    3761                 : 
    3762                 :         /* ------------------------------------------------------------ */
    3763                 :         case '=':                 /* Positive lookahead */
    3764               0 :         bravalue = OP_ASSERT;
    3765               0 :         ptr++;
    3766               0 :         break;
    3767                 : 
    3768                 : 
    3769                 :         /* ------------------------------------------------------------ */
    3770                 :         case '!':                 /* Negative lookahead */
    3771               0 :         bravalue = OP_ASSERT_NOT;
    3772               0 :         ptr++;
    3773               0 :         break;
    3774                 : 
    3775                 : 
    3776                 :         /* ------------------------------------------------------------ */
    3777                 :         case '<':                 /* Lookbehind or named define */
    3778               0 :         switch (ptr[1])
    3779                 :           {
    3780                 :           case '=':               /* Positive lookbehind */
    3781               0 :           bravalue = OP_ASSERTBACK;
    3782               0 :           ptr += 2;
    3783               0 :           break;
    3784                 : 
    3785                 :           case '!':               /* Negative lookbehind */
    3786               0 :           bravalue = OP_ASSERTBACK_NOT;
    3787               0 :           ptr += 2;
    3788               0 :           break;
    3789                 : 
    3790                 :           default:                /* Could be name define, else bad */
    3791               0 :           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
    3792               0 :           ptr++;                  /* Correct offset for error */
    3793               0 :           *errorcodeptr = ERR24;
    3794               0 :           goto FAILED;
    3795                 :           }
    3796               0 :         break;
    3797                 : 
    3798                 : 
    3799                 :         /* ------------------------------------------------------------ */
    3800                 :         case '>':                 /* One-time brackets */
    3801               0 :         bravalue = OP_ONCE;
    3802               0 :         ptr++;
    3803               0 :         break;
    3804                 : 
    3805                 : 
    3806                 :         /* ------------------------------------------------------------ */
    3807                 :         case 'C':                 /* Callout - may be followed by digits; */
    3808               0 :         previous_callout = code;  /* Save for later completion */
    3809               0 :         after_manual_callout = 1; /* Skip one item before completing */
    3810               0 :         *code++ = OP_CALLOUT;
    3811                 :           {
    3812               0 :           int n = 0;
    3813               0 :           while ((digitab[*(++ptr)] & ctype_digit) != 0)
    3814               0 :             n = n * 10 + *ptr - '0';
    3815               0 :           if (*ptr != ')')
    3816                 :             {
    3817               0 :             *errorcodeptr = ERR39;
    3818               0 :             goto FAILED;
    3819                 :             }
    3820               0 :           if (n > 255)
    3821                 :             {
    3822               0 :             *errorcodeptr = ERR38;
    3823               0 :             goto FAILED;
    3824                 :             }
    3825               0 :           *code++ = n;
    3826               0 :           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
    3827               0 :           PUT(code, LINK_SIZE, 0);                    /* Default length */
    3828               0 :           code += 2 * LINK_SIZE;
    3829                 :           }
    3830               0 :         previous = NULL;
    3831               0 :         continue;
    3832                 : 
    3833                 : 
    3834                 :         /* ------------------------------------------------------------ */
    3835                 :         case 'P':                 /* Python-style named subpattern handling */
    3836               0 :         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
    3837                 :           {
    3838               0 :           is_recurse = *ptr == '>';
    3839               0 :           terminator = ')';
    3840               0 :           goto NAMED_REF_OR_RECURSE;
    3841                 :           }
    3842               0 :         else if (*ptr != '<')    /* Test for Python-style definition */
    3843                 :           {
    3844               0 :           *errorcodeptr = ERR41;
    3845               0 :           goto FAILED;
    3846                 :           }
    3847                 :         /* Fall through to handle (?P< as (?< is handled */
    3848                 : 
    3849                 : 
    3850                 :         /* ------------------------------------------------------------ */
    3851               0 :         DEFINE_NAME:    /* Come here from (?< handling */
    3852                 :         case '\'':
    3853                 :           {
    3854               0 :           terminator = (*ptr == '<')? '>' : '\'';
    3855               0 :           name = ++ptr;
    3856                 : 
    3857               0 :           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
    3858               0 :           namelen = ptr - name;
    3859                 : 
    3860                 :           /* In the pre-compile phase, just do a syntax check. */
    3861                 : 
    3862               0 :           if (lengthptr != NULL)
    3863                 :             {
    3864               0 :             if (*ptr != terminator)
    3865                 :               {
    3866               0 :               *errorcodeptr = ERR42;
    3867               0 :               goto FAILED;
    3868                 :               }
    3869               0 :             if (cd->names_found >= MAX_NAME_COUNT)
    3870                 :               {
    3871               0 :               *errorcodeptr = ERR49;
    3872               0 :               goto FAILED;
    3873                 :               }
    3874               0 :             if (namelen + 3 > cd->name_entry_size)
    3875                 :               {
    3876               0 :               cd->name_entry_size = namelen + 3;
    3877               0 :               if (namelen > MAX_NAME_SIZE)
    3878                 :                 {
    3879               0 :                 *errorcodeptr = ERR48;
    3880               0 :                 goto FAILED;
    3881                 :                 }
    3882                 :               }
    3883                 :             }
    3884                 : 
    3885                 :           /* In the real compile, create the entry in the table */
    3886                 : 
    3887                 :           else
    3888                 :             {
    3889               0 :             slot = cd->name_table;
    3890               0 :             for (i = 0; i < cd->names_found; i++)
    3891                 :               {
    3892               0 :               int crc = memcmp(name, slot+2, namelen);
    3893               0 :               if (crc == 0)
    3894                 :                 {
    3895               0 :                 if (slot[2+namelen] == 0)
    3896                 :                   {
    3897               0 :                   if ((options & PCRE_DUPNAMES) == 0)
    3898                 :                     {
    3899               0 :                     *errorcodeptr = ERR43;
    3900               0 :                     goto FAILED;
    3901                 :                     }
    3902                 :                   }
    3903               0 :                 else crc = -1;      /* Current name is substring */
    3904                 :                 }
    3905               0 :               if (crc < 0)
    3906                 :                 {
    3907               0 :                 memmove(slot + cd->name_entry_size, slot,
    3908                 :                   (cd->names_found - i) * cd->name_entry_size);
    3909               0 :                 break;
    3910                 :                 }
    3911               0 :               slot += cd->name_entry_size;
    3912                 :               }
    3913                 : 
    3914               0 :             PUT2(slot, 0, cd->bracount + 1);
    3915               0 :             memcpy(slot + 2, name, namelen);
    3916               0 :             slot[2+namelen] = 0;
    3917                 :             }
    3918                 :           }
    3919                 : 
    3920                 :         /* In both cases, count the number of names we've encountered. */
    3921                 : 
    3922               0 :         ptr++;                    /* Move past > or ' */
    3923               0 :         cd->names_found++;
    3924               0 :         goto NUMBERED_GROUP;
    3925                 : 
    3926                 : 
    3927                 :         /* ------------------------------------------------------------ */
    3928                 :         case '&':                 /* Perl recursion/subroutine syntax */
    3929               0 :         terminator = ')';
    3930               0 :         is_recurse = TRUE;
    3931                 :         /* Fall through */
    3932                 : 
    3933                 :         /* We come here from the Python syntax above that handles both
    3934                 :         references (?P=name) and recursion (?P>name), as well as falling
    3935                 :         through from the Perl recursion syntax (?&name). */
    3936                 : 
    3937               0 :         NAMED_REF_OR_RECURSE:
    3938               0 :         name = ++ptr;
    3939               0 :         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
    3940               0 :         namelen = ptr - name;
    3941                 : 
    3942                 :         /* In the pre-compile phase, do a syntax check and set a dummy
    3943                 :         reference number. */
    3944                 : 
    3945               0 :         if (lengthptr != NULL)
    3946                 :           {
    3947               0 :           if (*ptr != terminator)
    3948                 :             {
    3949               0 :             *errorcodeptr = ERR42;
    3950               0 :             goto FAILED;
    3951                 :             }
    3952               0 :           if (namelen > MAX_NAME_SIZE)
    3953                 :             {
    3954               0 :             *errorcodeptr = ERR48;
    3955               0 :             goto FAILED;
    3956                 :             }
    3957               0 :           recno = 0;
    3958                 :           }
    3959                 : 
    3960                 :         /* In the real compile, seek the name in the table */
    3961                 : 
    3962                 :         else
    3963                 :           {
    3964               0 :           slot = cd->name_table;
    3965               0 :           for (i = 0; i < cd->names_found; i++)
    3966                 :             {
    3967               0 :             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
    3968               0 :             slot += cd->name_entry_size;
    3969                 :             }
    3970                 : 
    3971               0 :           if (i < cd->names_found)         /* Back reference */
    3972                 :             {
    3973               0 :             recno = GET2(slot, 0);
    3974                 :             }
    3975               0 :           else if ((recno =                /* Forward back reference */
    3976                 :                     find_parens(ptr, cd->bracount, name, namelen,
    3977                 :                       (options & PCRE_EXTENDED) != 0)) <= 0)
    3978                 :             {
    3979               0 :             *errorcodeptr = ERR15;
    3980               0 :             goto FAILED;
    3981                 :             }
    3982                 :           }
    3983                 : 
    3984                 :         /* In both phases, we can now go to the code that handles numerical
    3985                 :         recursion or backreferences. */
    3986                 : 
    3987               0 :         if (is_recurse) goto HANDLE_RECURSION;
    3988               0 :           else goto HANDLE_REFERENCE;
    3989                 : 
    3990                 : 
    3991                 :         /* ------------------------------------------------------------ */
    3992                 :         case 'R':                 /* Recursion */
    3993               0 :         ptr++;                    /* Same as (?0)      */
    3994                 :         /* Fall through */
    3995                 : 
    3996                 : 
    3997                 :         /* ------------------------------------------------------------ */
    3998                 :         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
    3999                 :         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
    4000                 :           {
    4001                 :           const uschar *called;
    4002               0 :           recno = 0;
    4003               0 :           while((digitab[*ptr] & ctype_digit) != 0)
    4004               0 :             recno = recno * 10 + *ptr++ - '0';
    4005               0 :           if (*ptr != ')')
    4006                 :             {
    4007               0 :             *errorcodeptr = ERR29;
    4008               0 :             goto FAILED;
    4009                 :             }
    4010                 : 
    4011                 :           /* Come here from code above that handles a named recursion */
    4012                 : 
    4013               0 :           HANDLE_RECURSION:
    4014                 : 
    4015               0 :           previous = code;
    4016               0 :           called = cd->start_code;
    4017                 : 
    4018                 :           /* When we are actually compiling, find the bracket that is being
    4019                 :           referenced. Temporarily end the regex in case it doesn't exist before
    4020                 :           this point. If we end up with a forward reference, first check that
    4021                 :           the bracket does occur later so we can give the error (and position)
    4022                 :           now. Then remember this forward reference in the workspace so it can
    4023                 :           be filled in at the end. */
    4024                 : 
    4025               0 :           if (lengthptr == NULL)
    4026                 :             {
    4027               0 :             *code = OP_END;
    4028               0 :             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
    4029                 : 
    4030                 :             /* Forward reference */
    4031                 : 
    4032               0 :             if (called == NULL)
    4033                 :               {
    4034               0 :               if (find_parens(ptr, cd->bracount, NULL, recno,
    4035                 :                    (options & PCRE_EXTENDED) != 0) < 0)
    4036                 :                 {
    4037               0 :                 *errorcodeptr = ERR15;
    4038               0 :                 goto FAILED;
    4039                 :                 }
    4040               0 :               called = cd->start_code + recno;
    4041               0 :               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
    4042                 :               }
    4043                 : 
    4044                 :             /* If not a forward reference, and the subpattern is still open,
    4045                 :             this is a recursive call. We check to see if this is a left
    4046                 :             recursion that could loop for ever, and diagnose that case. */
    4047                 : 
    4048               0 :             else if (GET(called, 1) == 0 &&
    4049                 :                      could_be_empty(called, code, bcptr, utf8))
    4050                 :               {
    4051               0 :               *errorcodeptr = ERR40;
    4052               0 :               goto FAILED;
    4053                 :               }
    4054                 :             }
    4055                 : 
    4056                 :           /* Insert the recursion/subroutine item, automatically wrapped inside
    4057                 :           "once" brackets. Set up a "previous group" length so that a
    4058                 :           subsequent quantifier will work. */
    4059                 : 
    4060               0 :           *code = OP_ONCE;
    4061               0 :           PUT(code, 1, 2 + 2*LINK_SIZE);
    4062               0 :           code += 1 + LINK_SIZE;
    4063                 : 
    4064               0 :           *code = OP_RECURSE;
    4065               0 :           PUT(code, 1, called - cd->start_code);
    4066               0 :           code += 1 + LINK_SIZE;
    4067                 : 
    4068               0 :           *code = OP_KET;
    4069               0 :           PUT(code, 1, 2 + 2*LINK_SIZE);
    4070               0 :           code += 1 + LINK_SIZE;
    4071                 : 
    4072               0 :           length_prevgroup = 3 + 3*LINK_SIZE;
    4073                 :           }
    4074                 : 
    4075                 :         /* Can't determine a first byte now */
    4076                 : 
    4077               0 :         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    4078               0 :         continue;
    4079                 : 
    4080                 : 
    4081                 :         /* ------------------------------------------------------------ */
    4082                 :         default:              /* Other characters: check option setting */
    4083               0 :         set = unset = 0;
    4084               0 :         optset = &set;
    4085                 : 
    4086               0 :         while (*ptr != ')' && *ptr != ':')
    4087                 :           {
    4088               0 :           switch (*ptr++)
    4089                 :             {
    4090               0 :             case '-': optset = &unset; break;
    4091                 : 
    4092                 :             case 'J':    /* Record that it changed in the external options */
    4093               0 :             *optset |= PCRE_DUPNAMES;
    4094               0 :             cd->external_options |= PCRE_JCHANGED;
    4095               0 :             break;
    4096                 : 
    4097               0 :             case 'i': *optset |= PCRE_CASELESS; break;
    4098               0 :             case 'm': *optset |= PCRE_MULTILINE; break;
    4099               0 :             case 's': *optset |= PCRE_DOTALL; break;
    4100               0 :             case 'x': *optset |= PCRE_EXTENDED; break;
    4101               0 :             case 'U': *optset |= PCRE_UNGREEDY; break;
    4102               0 :             case 'X': *optset |= PCRE_EXTRA; break;
    4103                 : 
    4104               0 :             default:  *errorcodeptr = ERR12;
    4105               0 :                       ptr--;    /* Correct the offset */
    4106               0 :                       goto FAILED;
    4107                 :             }
    4108                 :           }
    4109                 : 
    4110                 :         /* Set up the changed option bits, but don't change anything yet. */
    4111                 : 
    4112               0 :         newoptions = (options | set) & (~unset);
    4113                 : 
    4114                 :         /* If the options ended with ')' this is not the start of a nested
    4115                 :         group with option changes, so the options change at this level. If this
    4116                 :         item is right at the start of the pattern, the options can be
    4117                 :         abstracted and made external in the pre-compile phase, and ignored in
    4118                 :         the compile phase. This can be helpful when matching -- for instance in
    4119                 :         caseless checking of required bytes.
    4120                 : 
    4121                 :         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
    4122                 :         definitely *not* at the start of the pattern because something has been
    4123                 :         compiled. In the pre-compile phase, however, the code pointer can have
    4124                 :         that value after the start, because it gets reset as code is discarded
    4125                 :         during the pre-compile. However, this can happen only at top level - if
    4126                 :         we are within parentheses, the starting BRA will still be present. At
    4127                 :         any parenthesis level, the length value can be used to test if anything
    4128                 :         has been compiled at that level. Thus, a test for both these conditions
    4129                 :         is necessary to ensure we correctly detect the start of the pattern in
    4130                 :         both phases.
    4131                 : 
    4132                 :         If we are not at the pattern start, compile code to change the ims
    4133                 :         options if this setting actually changes any of them. We also pass the
    4134                 :         new setting back so that it can be put at the start of any following
    4135                 :         branches, and when this group ends (if we are in a group), a resetting
    4136                 :         item can be compiled. */
    4137                 : 
    4138               0 :         if (*ptr == ')')
    4139                 :           {
    4140               0 :           if (code == cd->start_code + 1 + LINK_SIZE &&
    4141                 :                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
    4142                 :             {
    4143               0 :             cd->external_options = newoptions;
    4144               0 :             options = newoptions;
    4145                 :             }
    4146                 :          else
    4147                 :             {
    4148               0 :             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
    4149                 :               {
    4150               0 :               *code++ = OP_OPT;
    4151               0 :               *code++ = newoptions & PCRE_IMS;
    4152                 :               }
    4153                 : 
    4154                 :             /* Change options at this level, and pass them back for use
    4155                 :             in subsequent branches. Reset the greedy defaults and the case
    4156                 :             value for firstbyte and reqbyte. */
    4157                 : 
    4158               0 :             *optionsptr = options = newoptions;
    4159               0 :             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
    4160               0 :             greedy_non_default = greedy_default ^ 1;
    4161               0 :             req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
    4162                 :             }
    4163                 : 
    4164               0 :           previous = NULL;       /* This item can't be repeated */
    4165               0 :           continue;              /* It is complete */
    4166                 :           }
    4167                 : 
    4168                 :         /* If the options ended with ':' we are heading into a nested group
    4169                 :         with possible change of options. Such groups are non-capturing and are
    4170                 :         not assertions of any kind. All we need to do is skip over the ':';
    4171                 :         the newoptions value is handled below. */
    4172                 : 
    4173               0 :         bravalue = OP_BRA;
    4174               0 :         ptr++;
    4175                 :         }     /* End of switch for character following (? */
    4176                 :       }       /* End of (? handling */
    4177                 : 
    4178                 :     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
    4179                 :     all unadorned brackets become non-capturing and behave like (?:...)
    4180                 :     brackets. */
    4181                 : 
    4182              38 :     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
    4183                 :       {
    4184               0 :       bravalue = OP_BRA;
    4185                 :       }
    4186                 : 
    4187                 :     /* Else we have a capturing group. */
    4188                 : 
    4189                 :     else
    4190                 :       {
    4191              38 :       NUMBERED_GROUP:
    4192              38 :       cd->bracount += 1;
    4193              38 :       PUT2(code, 1+LINK_SIZE, cd->bracount);
    4194              38 :       skipbytes = 2;
    4195                 :       }
    4196                 : 
    4197                 :     /* Process nested bracketed regex. Assertions may not be repeated, but
    4198                 :     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
    4199                 :     non-register variable in order to be able to pass its address because some
    4200                 :     compilers complain otherwise. Pass in a new setting for the ims options if
    4201                 :     they have changed. */
    4202                 : 
    4203              38 :     previous = (bravalue >= OP_ONCE)? code : NULL;
    4204              38 :     *code = bravalue;
    4205              38 :     tempcode = code;
    4206              38 :     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
    4207              38 :     length_prevgroup = 0;              /* Initialize for pre-compile phase */
    4208                 : 
    4209              38 :     if (!compile_regex(
    4210                 :          newoptions,                   /* The complete new option state */
    4211                 :          options & PCRE_IMS,           /* The previous ims option state */
    4212                 :          &tempcode,                    /* Where to put code (updated) */
    4213                 :          &ptr,                         /* Input pointer (updated) */
    4214                 :          errorcodeptr,                 /* Where to put an error message */
    4215                 :          (bravalue == OP_ASSERTBACK ||
    4216                 :           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
    4217                 :          skipbytes,                    /* Skip over bracket number */
    4218                 :          &subfirstbyte,                /* For possible first char */
    4219                 :          &subreqbyte,                  /* For possible last char */
    4220                 :          bcptr,                        /* Current branch chain */
    4221                 :          cd,                           /* Tables block */
    4222                 :          (lengthptr == NULL)? NULL :   /* Actual compile phase */
    4223                 :            &length_prevgroup           /* Pre-compile phase */
    4224                 :          ))
    4225               0 :       goto FAILED;
    4226                 : 
    4227                 :     /* At the end of compiling, code is still pointing to the start of the
    4228                 :     group, while tempcode has been updated to point past the end of the group
    4229                 :     and any option resetting that may follow it. The pattern pointer (ptr)
    4230                 :     is on the bracket. */
    4231                 : 
    4232                 :     /* If this is a conditional bracket, check that there are no more than
    4233                 :     two branches in the group, or just one if it's a DEFINE group. */
    4234                 : 
    4235              38 :     if (bravalue == OP_COND)
    4236                 :       {
    4237               0 :       uschar *tc = code;
    4238               0 :       int condcount = 0;
    4239                 : 
    4240                 :       do {
    4241               0 :          condcount++;
    4242               0 :          tc += GET(tc,1);
    4243                 :          }
    4244               0 :       while (*tc != OP_KET);
    4245                 : 
    4246                 :       /* A DEFINE group is never obeyed inline (the "condition" is always
    4247                 :       false). It must have only one branch. */
    4248                 : 
    4249               0 :       if (code[LINK_SIZE+1] == OP_DEF)
    4250                 :         {
    4251               0 :         if (condcount > 1)
    4252                 :           {
    4253               0 :           *errorcodeptr = ERR54;
    4254               0 :           goto FAILED;
    4255                 :           }
    4256               0 :         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
    4257                 :         }
    4258                 : 
    4259                 :       /* A "normal" conditional group. If there is just one branch, we must not
    4260                 :       make use of its firstbyte or reqbyte, because this is equivalent to an
    4261                 :       empty second branch. */
    4262                 : 
    4263                 :       else
    4264                 :         {
    4265               0 :         if (condcount > 2)
    4266                 :           {
    4267               0 :           *errorcodeptr = ERR27;
    4268               0 :           goto FAILED;
    4269                 :           }
    4270               0 :         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
    4271                 :         }
    4272                 :       }
    4273                 : 
    4274                 :     /* Error if hit end of pattern */
    4275                 : 
    4276              38 :     if (*ptr != ')')
    4277                 :       {
    4278               0 :       *errorcodeptr = ERR14;
    4279               0 :       goto FAILED;
    4280                 :       }
    4281                 : 
    4282                 :     /* In the pre-compile phase, update the length by the length of the nested
    4283                 :     group, less the brackets at either end. Then reduce the compiled code to
    4284                 :     just the brackets so that it doesn't use much memory if it is duplicated by
    4285                 :     a quantifier. */
    4286                 : 
    4287              38 :     if (lengthptr != NULL)
    4288                 :       {
    4289              19 :       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
    4290              19 :       code++;
    4291              19 :       PUTINC(code, 0, 1 + LINK_SIZE);
    4292              19 :       *code++ = OP_KET;
    4293              19 :       PUTINC(code, 0, 1 + LINK_SIZE);
    4294                 :       }
    4295                 : 
    4296                 :     /* Otherwise update the main code pointer to the end of the group. */
    4297                 : 
    4298              19 :     else code = tempcode;
    4299                 : 
    4300                 :     /* For a DEFINE group, required and first character settings are not
    4301                 :     relevant. */
    4302                 : 
    4303              38 :     if (bravalue == OP_DEF) break;
    4304                 : 
    4305                 :     /* Handle updating of the required and first characters for other types of
    4306                 :     group. Update for normal brackets of all kinds, and conditions with two
    4307                 :     branches (see code above). If the bracket is followed by a quantifier with
    4308                 :     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
    4309                 :     zerofirstbyte outside the main loop so that they can be accessed for the
    4310                 :     back off. */
    4311                 : 
    4312              38 :     zeroreqbyte = reqbyte;
    4313              38 :     zerofirstbyte = firstbyte;
    4314              38 :     groupsetfirstbyte = FALSE;
    4315                 : 
    4316              38 :     if (bravalue >= OP_ONCE)
    4317                 :       {
    4318                 :       /* If we have not yet set a firstbyte in this branch, take it from the
    4319                 :       subpattern, remembering that it was set here so that a repeat of more
    4320                 :       than one can replicate it as reqbyte if necessary. If the subpattern has
    4321                 :       no firstbyte, set "none" for the whole branch. In both cases, a zero
    4322                 :       repeat forces firstbyte to "none". */
    4323                 : 
    4324              38 :       if (firstbyte == REQ_UNSET)
    4325                 :         {
    4326               0 :         if (subfirstbyte >= 0)
    4327                 :           {
    4328               0 :           firstbyte = subfirstbyte;
    4329               0 :           groupsetfirstbyte = TRUE;
    4330                 :           }
    4331               0 :         else firstbyte = REQ_NONE;
    4332               0 :         zerofirstbyte = REQ_NONE;
    4333                 :         }
    4334                 : 
    4335                 :       /* If firstbyte was previously set, convert the subpattern's firstbyte
    4336                 :       into reqbyte if there wasn't one, using the vary flag that was in
    4337                 :       existence beforehand. */
    4338                 : 
    4339              38 :       else if (subfirstbyte >= 0 && subreqbyte < 0)
    4340              17 :         subreqbyte = subfirstbyte | tempreqvary;
    4341                 : 
    4342                 :       /* If the subpattern set a required byte (or set a first byte that isn't
    4343                 :       really the first byte - see above), set it. */
    4344                 : 
    4345              38 :       if (subreqbyte >= 0) reqbyte = subreqbyte;
    4346                 :       }
    4347                 : 
    4348                 :     /* For a forward assertion, we take the reqbyte, if set. This can be
    4349                 :     helpful if the pattern that follows the assertion doesn't set a different
    4350                 :     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
    4351                 :     for an assertion, however because it leads to incorrect effect for patterns
    4352                 :     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
    4353                 :     of a firstbyte. This is overcome by a scan at the end if there's no
    4354                 :     firstbyte, looking for an asserted first char. */
    4355                 : 
    4356               0 :     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
    4357              38 :     break;     /* End of processing '(' */
    4358                 : 
    4359                 : 
    4360                 :     /* ===================================================================*/
    4361                 :     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
    4362                 :     are arranged to be the negation of the corresponding OP_values. For the
    4363                 :     back references, the values are ESC_REF plus the reference number. Only
    4364                 :     back references and those types that consume a character may be repeated.
    4365                 :     We can test for values between ESC_b and ESC_Z for the latter; this may
    4366                 :     have to change if any new ones are ever created. */
    4367                 : 
    4368                 :     case '\\':
    4369            7610 :     tempptr = ptr;
    4370            7610 :     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
    4371            7610 :     if (*errorcodeptr != 0) goto FAILED;
    4372                 : 
    4373            7610 :     if (c < 0)
    4374                 :       {
    4375               6 :       if (-c == ESC_Q)            /* Handle start of quoted string */
    4376                 :         {
    4377               0 :         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
    4378               0 :           else inescq = TRUE;
    4379               0 :         continue;
    4380                 :         }
    4381                 : 
    4382               6 :       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
    4383                 : 
    4384                 :       /* For metasequences that actually match a character, we disable the
    4385                 :       setting of a first character if it hasn't already been set. */
    4386                 : 
    4387               6 :       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
    4388               2 :         firstbyte = REQ_NONE;
    4389                 : 
    4390                 :       /* Set values to reset to if this is followed by a zero repeat. */
    4391                 : 
    4392               6 :       zerofirstbyte = firstbyte;
    4393               6 :       zeroreqbyte = reqbyte;
    4394                 : 
    4395                 :       /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
    4396                 : 
    4397               6 :       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
    4398                 :         {
    4399               0 :         is_recurse = FALSE;
    4400               0 :         terminator = (*(++ptr) == '<')? '>' : '\'';
    4401               0 :         goto NAMED_REF_OR_RECURSE;
    4402                 :         }
    4403                 : 
    4404                 :       /* Back references are handled specially; must disable firstbyte if
    4405                 :       not set to cope with cases like (?=(\w+))\1: which would otherwise set
    4406                 :       ':' later. */
    4407                 : 
    4408               6 :       if (-c >= ESC_REF)
    4409                 :         {
    4410               0 :         recno = -c - ESC_REF;
    4411                 : 
    4412               0 :         HANDLE_REFERENCE:    /* Come here from named backref handling */
    4413               0 :         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    4414               0 :         previous = code;
    4415               0 :         *code++ = OP_REF;
    4416               0 :         PUT2INC(code, 0, recno);
    4417               0 :         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
    4418               0 :         if (recno > cd->top_backref) cd->top_backref = recno;
    4419                 :         }
    4420                 : 
    4421                 :       /* So are Unicode property matches, if supported. */
    4422                 : 
    4423                 : #ifdef SUPPORT_UCP
    4424               6 :       else if (-c == ESC_P || -c == ESC_p)
    4425                 :         {
    4426                 :         BOOL negated;
    4427                 :         int pdata;
    4428               0 :         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
    4429               0 :         if (ptype < 0) goto FAILED;
    4430               0 :         previous = code;
    4431               0 :         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
    4432               0 :         *code++ = ptype;
    4433               0 :         *code++ = pdata;
    4434                 :         }
    4435                 : #else
    4436                 : 
    4437                 :       /* If Unicode properties are not supported, \X, \P, and \p are not
    4438                 :       allowed. */
    4439                 : 
    4440                 :       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
    4441                 :         {
    4442                 :         *errorcodeptr = ERR45;
    4443                 :         goto FAILED;
    4444                 :         }
    4445                 : #endif
    4446                 : 
    4447                 :       /* For the rest (including \X when Unicode properties are supported), we
    4448                 :       can obtain the OP value by negating the escape value. */
    4449                 : 
    4450                 :       else
    4451                 :         {
    4452               6 :         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
    4453               6 :         *code++ = -c;
    4454                 :         }
    4455               6 :       continue;
    4456                 :       }
    4457                 : 
    4458                 :     /* We have a data character whose value is in c. In UTF-8 mode it may have
    4459                 :     a value > 127. We set its representation in the length/buffer, and then
    4460                 :     handle it as a data character. */
    4461                 : 
    4462                 : #ifdef SUPPORT_UTF8
    4463            7604 :     if (utf8 && c > 127)
    4464               0 :       mclength = _pcre_ord2utf8(c, mcbuffer);
    4465                 :     else
    4466                 : #endif
    4467                 : 
    4468                 :      {
    4469            7604 :      mcbuffer[0] = c;
    4470            7604 :      mclength = 1;
    4471                 :      }
    4472            7604 :     goto ONE_CHAR;
    4473                 : 
    4474                 : 
    4475                 :     /* ===================================================================*/
    4476                 :     /* Handle a literal character. It is guaranteed not to be whitespace or #
    4477                 :     when the extended flag is set. If we are in UTF-8 mode, it may be a
    4478                 :     multi-byte literal character. */
    4479                 : 
    4480                 :     default:
    4481           62756 :     NORMAL_CHAR:
    4482           62756 :     mclength = 1;
    4483           62756 :     mcbuffer[0] = c;
    4484                 : 
    4485                 : #ifdef SUPPORT_UTF8
    4486           62756 :     if (utf8 && c >= 0xc0)
    4487                 :       {
    4488               0 :       while ((ptr[1] & 0xc0) == 0x80)
    4489               0 :         mcbuffer[mclength++] = *(++ptr);
    4490                 :       }
    4491                 : #endif
    4492                 : 
    4493                 :     /* At this point we have the character's bytes in mcbuffer, and the length
    4494                 :     in mclength. When not in UTF-8 mode, the length is always 1. */
    4495                 : 
    4496           70360 :     ONE_CHAR:
    4497           70360 :     previous = code;
    4498           70360 :     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
    4499           70360 :     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
    4500                 : 
    4501                 :     /* Set the first and required bytes appropriately. If no previous first
    4502                 :     byte, set it from this character, but revert to none on a zero repeat.
    4503                 :     Otherwise, leave the firstbyte value alone, and don't change it on a zero
    4504                 :     repeat. */
    4505                 : 
    4506           70360 :     if (firstbyte == REQ_UNSET)
    4507                 :       {
    4508             268 :       zerofirstbyte = REQ_NONE;
    4509             268 :       zeroreqbyte = reqbyte;
    4510                 : 
    4511                 :       /* If the character is more than one byte long, we can set firstbyte
    4512                 :       only if it is not to be matched caselessly. */
    4513                 : 
    4514             536 :       if (mclength == 1 || req_caseopt == 0)
    4515                 :         {
    4516             268 :         firstbyte = mcbuffer[0] | req_caseopt;
    4517             268 :         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
    4518                 :         }
    4519               0 :       else firstbyte = reqbyte = REQ_NONE;
    4520                 :       }
    4521                 : 
    4522                 :     /* firstbyte was previously set; we can set reqbyte only if the length is
    4523                 :     1 or the matching is caseful. */
    4524                 : 
    4525                 :     else
    4526                 :       {
    4527           70092 :       zerofirstbyte = firstbyte;
    4528           70092 :       zeroreqbyte = reqbyte;
    4529           70092 :       if (mclength == 1 || req_caseopt == 0)
    4530           70092 :         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
    4531                 :       }
    4532                 : 
    4533                 :     break;            /* End of literal character handling */
    4534                 :     }
    4535           73508 :   }                   /* end of big loop */
    4536                 : 
    4537                 : 
    4538                 : /* Control never reaches here by falling through, only by a goto for all the
    4539                 : error states. Pass back the position in the pattern so that it can be displayed
    4540                 : to the user for diagnosing the error. */
    4541                 : 
    4542               0 : FAILED:
    4543               0 : *ptrptr = ptr;
    4544               0 : return FALSE;
    4545                 : }
    4546                 : 
    4547                 : 
    4548                 : 
    4549                 : 
    4550                 : /*************************************************
    4551                 : *     Compile sequence of alternatives           *
    4552                 : *************************************************/
    4553                 : 
    4554                 : /* On entry, ptr is pointing past the bracket character, but on return it
    4555                 : points to the closing bracket, or vertical bar, or end of string. The code
    4556                 : variable is pointing at the byte into which the BRA operator has been stored.
    4557                 : If the ims options are changed at the start (for a (?ims: group) or during any
    4558                 : branch, we need to insert an OP_OPT item at the start of every following branch
    4559                 : to ensure they get set correctly at run time, and also pass the new options
    4560                 : into every subsequent branch compile.
    4561                 : 
    4562                 : This function is used during the pre-compile phase when we are trying to find
    4563                 : out the amount of memory needed, as well as during the real compile phase. The
    4564                 : value of lengthptr distinguishes the two phases.
    4565                 : 
    4566                 : Arguments:
    4567                 :   options        option bits, including any changes for this subpattern
    4568                 :   oldims         previous settings of ims option bits
    4569                 :   codeptr        -> the address of the current code pointer
    4570                 :   ptrptr         -> the address of the current pattern pointer
    4571                 :   errorcodeptr   -> pointer to error code variable
    4572                 :   lookbehind     TRUE if this is a lookbehind assertion
    4573                 :   skipbytes      skip this many bytes at start (for brackets and OP_COND)
    4574                 :   firstbyteptr   place to put the first required character, or a negative number
    4575                 :   reqbyteptr     place to put the last required character, or a negative number
    4576                 :   bcptr          pointer to the chain of currently open branches
    4577                 :   cd             points to the data block with tables pointers etc.
    4578                 :   lengthptr      NULL during the real compile phase
    4579                 :                  points to length accumulator during pre-compile phase
    4580                 : 
    4581                 : Returns:         TRUE on success, FALSE on failure (*ptrptr is updated either way)
    4582                 : */
    4583                 : 
    4584                 : static BOOL
    4585                 : compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
    4586                 :   int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
    4587                 :   int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
    4588             406 : {
    4589             406 : const uschar *ptr = *ptrptr;      /* local pattern cursor; written back via *ptrptr on every return */
    4590             406 : uschar *code = *codeptr;          /* local output cursor; written back via *codeptr only on success */
    4591             406 : uschar *last_branch = code;       /* start of the most recent branch: the bracket, then each OP_ALT */
    4592             406 : uschar *start_bracket = code;     /* start of the whole group, used for the length in the KET */
    4593             406 : uschar *reverse_count = NULL;     /* where the OP_REVERSE count lives; set only for lookbehinds */
    4594                 : int firstbyte, reqbyte;             /* values accumulated across all branches */
    4595                 : int branchfirstbyte, branchreqbyte; /* values handed back by compile_branch() for one branch */
    4596                 : int length;                         /* local length accumulator, see below */
    4597                 : branch_chain bc;
    4598                 : 
    4599             406 : bc.outer = bcptr;                 /* link this group onto the chain of enclosing open branches */
    4600             406 : bc.current = code;
    4601                 : 
    4602             406 : firstbyte = reqbyte = REQ_UNSET;
    4603                 : 
    4604                 : /* Accumulate the length for use in the pre-compile phase. Start with the
    4605                 : length of the BRA and KET and any extra bytes that are required at the
    4606                 : beginning. We accumulate in a local variable to save frequent testing of
    4607                 : lengthptr for NULL. We cannot do this by looking at the value of code at the
    4608                 : start and end of each alternative, because compiled items are discarded during
    4609                 : the pre-compile phase so that the work space is not exceeded. */
    4610                 : 
    4611             406 : length = 2 + 2*LINK_SIZE + skipbytes;
    4612                 : 
    4613                 : /* WARNING: If the above line is changed for any reason, you must also change
    4614                 : the code that abstracts option settings at the start of the pattern and makes
    4615                 : them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
    4616                 : pre-compile phase to find out whether anything has yet been compiled or not. */
    4617                 : 
    4618                 : /* Offset is set zero to mark that this bracket is still open */
    4619                 : 
    4620             406 : PUT(code, 1, 0);
    4621             406 : code += 1 + LINK_SIZE + skipbytes;  /* step over the opcode, its link field and the skipped bytes */
    4622                 : 
    4623                 : /* Loop for each alternative branch */
    4624                 : 
    4625                 : for (;;)
    4626                 :   {
    4627                 :   /* Handle a change of ims options at the start of the branch */
    4628                 : 
    4629             406 :   if ((options & PCRE_IMS) != oldims)
    4630                 :     {
    4631               0 :     *code++ = OP_OPT;
    4632               0 :     *code++ = options & PCRE_IMS;
    4633               0 :     length += 2;
    4634                 :     }
    4635                 : 
    4636                 :   /* Set up dummy OP_REVERSE if lookbehind assertion */
    4637                 : 
    4638             406 :   if (lookbehind)
    4639                 :     {
    4640               0 :     *code++ = OP_REVERSE;
    4641               0 :     reverse_count = code;
    4642               0 :     PUTINC(code, 0, 0);             /* placeholder; overwritten with the fixed length further down */
    4643               0 :     length += 1 + LINK_SIZE;
    4644                 :     }
    4645                 : 
    4646                 :   /* Now compile the branch; in the pre-compile phase its length gets added
    4647                 :   into the length. */
    4648                 : 
    4649             406 :   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
    4650                 :         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
    4651                 :     {
    4652               0 :     *ptrptr = ptr;                  /* hand back the pattern position reached by the failed branch */
    4653               0 :     return FALSE;
    4654                 :     }
    4655                 : 
    4656                 :   /* In the real compile phase, there is some post-processing to be done. */
    4657                 : 
    4658             406 :   if (lengthptr == NULL)
    4659                 :     {
    4660                 :     /* If this is the first branch, the firstbyte and reqbyte values for the
    4661                 :     branch become the values for the regex. */
    4662                 : 
    4663             203 :     if (*last_branch != OP_ALT)
    4664                 :       {
    4665             203 :       firstbyte = branchfirstbyte;
    4666             203 :       reqbyte = branchreqbyte;
    4667                 :       }
    4668                 : 
    4669                 :     /* If this is not the first branch, the first char and reqbyte have to
    4670                 :     match the values from all the previous branches, except that if the
    4671                 :     previous value for reqbyte didn't have REQ_VARY set, it can still match,
    4672                 :     and we set REQ_VARY for the regex. */
    4673                 : 
    4674                 :     else
    4675                 :       {
    4676                 :       /* If we previously had a firstbyte, but it doesn't match the new branch,
    4677                 :       we have to abandon the firstbyte for the regex, but if there was
    4678                 :       previously no reqbyte, it takes on the value of the old firstbyte. */
    4679                 : 
    4680               0 :       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
    4681                 :         {
    4682               0 :         if (reqbyte < 0) reqbyte = firstbyte;
    4683               0 :         firstbyte = REQ_NONE;
    4684                 :         }
    4685                 : 
    4686                 :       /* If we (now or from before) have no firstbyte, a firstbyte from the
    4687                 :       branch becomes a reqbyte if there isn't a branch reqbyte. */
    4688                 : 
    4689               0 :       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
    4690               0 :           branchreqbyte = branchfirstbyte;
    4691                 : 
    4692                 :       /* Now ensure that the reqbytes match */
    4693                 : 
    4694               0 :       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
    4695               0 :         reqbyte = REQ_NONE;
    4696               0 :       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
    4697                 :       }
    4698                 : 
    4699                 :     /* If lookbehind, check that this branch matches a fixed-length string, and
    4700                 :     put the length into the OP_REVERSE item. Temporarily mark the end of the
    4701                 :     branch with OP_END. */
    4702                 : 
    4703             203 :     if (lookbehind)
    4704                 :       {
    4705                 :       int fixed_length;
    4706               0 :       *code = OP_END;
    4707               0 :       fixed_length = find_fixedlength(last_branch, options);
    4708                 :       DPRINTF(("fixed length = %d\n", fixed_length));
    4709               0 :       if (fixed_length < 0)
    4710                 :         {
    4711               0 :         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;  /* -2 selects ERR36, any other negative ERR25 */
    4712               0 :         *ptrptr = ptr;
    4713               0 :         return FALSE;
    4714                 :         }
    4715               0 :       PUT(reverse_count, 0, fixed_length);
    4716                 :       }
    4717                 :     }
    4718                 : 
    4719                 :   /* Reached end of expression, either ')' or end of pattern. Go back through
    4720                 :   the alternative branches and reverse the chain of offsets, with the field in
    4721                 :   the BRA item now becoming an offset to the first alternative. If there are
    4722                 :   no alternatives, it points to the end of the group. The length in the
    4723                 :   terminating ket is always the length of the whole bracketed item. If any of
    4724                 :   the ims options were changed inside the group, compile a resetting op-code
    4725                 :   following, except at the very end of the pattern. Return leaving the pointer
    4726                 :   at the terminating char. */
    4727                 : 
    4728             406 :   if (*ptr != '|')
    4729                 :     {
    4730             406 :     int branch_length = code - last_branch;
    4731                 :     do
    4732                 :       {
    4733             406 :       int prev_length = GET(last_branch, 1);   /* backward offset stored earlier (0 at the bracket) */
    4734             406 :       PUT(last_branch, 1, branch_length);      /* replace it with the forward length of this branch */
    4735             406 :       branch_length = prev_length;
    4736             406 :       last_branch -= branch_length;            /* step back to the previous branch */
    4737                 :       }
    4738             406 :     while (branch_length > 0);                 /* a zero offset means the opening bracket was just handled */
    4739                 : 
    4740                 :     /* Fill in the ket */
    4741                 : 
    4742             406 :     *code = OP_KET;
    4743             406 :     PUT(code, 1, code - start_bracket);
    4744             406 :     code += 1 + LINK_SIZE;
    4745                 : 
    4746                 :     /* Resetting option if needed */
    4747                 : 
    4748             406 :     if ((options & PCRE_IMS) != oldims && *ptr == ')')
    4749                 :       {
    4750               0 :       *code++ = OP_OPT;
    4751               0 :       *code++ = oldims;
    4752               0 :       length += 2;
    4753                 :       }
    4754                 : 
    4755                 :     /* Set values to pass back */
    4756                 : 
    4757             406 :     *codeptr = code;
    4758             406 :     *ptrptr = ptr;
    4759             406 :     *firstbyteptr = firstbyte;
    4760             406 :     *reqbyteptr = reqbyte;
    4761             406 :     if (lengthptr != NULL) *lengthptr += length;   /* pre-compile phase only: add this group's length */
    4762             406 :     return TRUE;
    4763                 :     }
    4764                 : 
    4765                 :   /* Another branch follows; insert an "or" node. Its length field points back
    4766                 :   to the previous branch while the bracket remains open. At the end the chain
    4767                 :   is reversed. It's done like this so that the start of the bracket has a
    4768                 :   zero offset until it is closed, making it possible to detect recursion. */
    4769                 : 
    4770               0 :   *code = OP_ALT;
    4771               0 :   PUT(code, 1, code - last_branch);
    4772               0 :   bc.current = last_branch = code;
    4773               0 :   code += 1 + LINK_SIZE;
    4774               0 :   ptr++;                            /* move past the '|' in the pattern */
    4775               0 :   length += 1 + LINK_SIZE;
    4776               0 :   }
    4777                 : /* Control never reaches here */
    4778                 : }
    4779                 : 
    4780                 : 
    4781                 : 
    4782                 : 
    4783                 : /*************************************************
    4784                 : *          Check for anchored expression         *
    4785                 : *************************************************/
    4786                 : 
    4787                 : /* Try to find out if this is an anchored regular expression. Consider each
    4788                 : alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
    4789                 : all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
    4790                 : it's anchored. However, if this is a multiline pattern, then only OP_SOD
    4791                 : counts, since OP_CIRC can match in the middle.
    4792                 : 
    4793                 : We can also consider a regex to be anchored if OP_SOM starts all its branches.
    4794                 : This is the code for \G, which means "match at start of match position, taking
    4795                 : into account the match offset".
    4796                 : 
    4797                 : A branch is also implicitly anchored if it starts with .* and DOTALL is set,
    4798                 : because that will try the rest of the pattern at all possible matching points,
    4799                 : so there is no point trying again.... er ....
    4800                 : 
    4801                 : .... except when the .* appears inside capturing parentheses, and there is a
    4802                 : subsequent back reference to those parentheses. We haven't enough information
    4803                 : to catch that case precisely.
    4804                 : 
    4805                 : At first, the best we could do was to detect when .* was in capturing brackets
    4806                 : and the highest back reference was greater than or equal to that level.
    4807                 : However, by keeping a bitmap of the first 31 back references, we can catch some
    4808                 : of the more common cases more precisely.
    4809                 : 
    4810                 : Arguments:
    4811                 :   code           points to start of expression (the bracket)
    4812                 :   options        points to the options setting
    4813                 :   bracket_map    a bitmap of which brackets we are inside while testing; this
    4814                 :                   handles up to substring 31; after that we just have to take
    4815                 :                   the less precise approach
    4816                 :   backref_map    the back reference bitmap
    4817                 : 
    4818                 : Returns:     TRUE or FALSE
    4819                 : */
    4820                 : 
    4821                 : static BOOL
    4822                 : is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
    4823                 :   unsigned int backref_map)
    4824             184 : {
    4825                 : do {
    4826                 :    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
    4827             184 :      options, PCRE_MULTILINE, FALSE);
    4828             184 :    register int op = *scode;   /* opcode at the position returned by first_significant_code() */
    4829                 : 
    4830                 :    /* Non-capturing brackets */
    4831                 : 
    4832             184 :    if (op == OP_BRA)
    4833                 :      {
    4834               0 :      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;   /* recurse into the nested group */
    4835                 :      }
    4836                 : 
    4837                 :    /* Capturing brackets */
    4838                 : 
    4839             184 :    else if (op == OP_CBRA)
    4840                 :      {
    4841               0 :      int n = GET2(scode, 1+LINK_SIZE);   /* value stored after the link field; used as the bracket number */
    4842               0 :      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);   /* numbers of 32 and above all share bit 0 */
    4843               0 :      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
    4844                 :      }
    4845                 : 
    4846                 :    /* Other brackets */
    4847                 : 
    4848             184 :    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
    4849                 :      {
    4850               0 :      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
    4851                 :      }
    4852                 : 
    4853                 :    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
    4854                 :    are or may be referenced. */
    4855                 : 
    4856             184 :    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
    4857                 :              op == OP_TYPEPOSSTAR) &&
    4858                 :             (*options & PCRE_DOTALL) != 0)
    4859                 :      {
    4860               0 :      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;   /* not ".*", or an enclosing bracket is back-referenced */
    4861                 :      }
    4862                 : 
    4863                 :    /* Check for explicit anchoring */
    4864                 : 
    4865             184 :    else if (op != OP_SOD && op != OP_SOM &&
    4866                 :            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))   /* OP_CIRC counts only when MULTILINE is off */
    4867               2 :      return FALSE;
    4868             182 :    code += GET(code, 1);   /* follow this branch's link field to the item that ends it */
    4869                 :    }
    4870             182 : while (*code == OP_ALT);   /* Loop for each alternative */
    4871             182 : return TRUE;
    4872                 : }
    4873                 : 
    4874                 : 
    4875                 : 
    4876                 : /*************************************************
    4877                 : *         Check for starting with ^ or .*        *
    4878                 : *************************************************/
    4879                 : 
    4880                 : /* This is called to find out if every branch starts with ^ or .* so that
    4881                 : "first char" processing can be done to speed things up in multiline
    4882                 : matching and for non-DOTALL patterns that start with .* (which must start at
    4883                 : the beginning or after \n). Nested brackets are examined recursively. As with
    4884                 : is_anchored() (see above), back references to capturing brackets that contain
    4885                 : .* have to be taken into account, because then the assumption does not hold.
    4886                 : 
    4887                 : Arguments:
    4888                 :   code           points to start of expression (the bracket)
    4889                 :   bracket_map    a bitmap of which brackets we are inside while testing; bit n
    4890                 :                   stands for capturing bracket n when n < 32, and all higher-
    4891                 :                   numbered brackets share bit 0 (the less precise approach)
    4892                 :   backref_map    the back reference bitmap
    4893                 : 
    4894                 : Returns:         TRUE if every alternative passes the test, otherwise FALSE
    4895                 : */
    4896                 : 
    4897                 : static BOOL
    4898                 : is_startline(const uschar *code, unsigned int bracket_map,
    4899                 :   unsigned int backref_map)
    4900               1 : {
    4901                 : do {
    4902                 :    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
    4903               1 :      NULL, 0, FALSE);
    4904               1 :    register int op = *scode;
    4905                 : 
    4906                 :    /* Non-capturing brackets */
    4907                 : 
    4908               1 :    if (op == OP_BRA)
    4909                 :      {
    4910               0 :      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
    4911                 :      }
    4912                 : 
    4913                 :    /* Capturing brackets: add this bracket's bit to the map for the nested scan */
    4914                 : 
    4915               1 :    else if (op == OP_CBRA)
    4916                 :      {   /* Unsigned arithmetic throughout: shifting a signed 1 by 31 is undefined */
    4917               0 :      int n = GET2(scode, 1+LINK_SIZE);
    4918               0 :      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
    4919               0 :      if (!is_startline(scode, new_map, backref_map)) return FALSE;
    4920                 :      }
    4921                 : 
    4922                 :    /* Other brackets */
    4923                 : 
    4924               1 :    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
    4925               0 :      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
    4926                 : 
    4927                 :    /* .* means "start at start or after \n" if it isn't in brackets that
    4928                 :    may be referenced. */
    4929                 : 
    4930               1 :    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
    4931                 :      {
    4932               0 :      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
    4933                 :      }
    4934                 : 
    4935                 :    /* Check for explicit circumflex */
    4936                 : 
    4937               1 :    else if (op != OP_CIRC) return FALSE;
    4938                 : 
    4939                 :    /* Move on to the next alternative */
    4940                 : 
    4941               0 :    code += GET(code, 1);
    4942                 :    }
    4943               0 : while (*code == OP_ALT);  /* Loop for each alternative */
    4944               0 : return TRUE;
    4945                 : }
    4946                 : 
    4947                 : 
    4948                 : 
    4949                 : /*************************************************
    4950                 : *       Check for asserted fixed first char      *
    4951                 : *************************************************/
    4952                 : 
    4953                 : /* During compilation, the "first char" settings from forward assertions are
    4954                 : discarded, because they can cause conflicts with actual literals that follow.
    4955                 : However, if we end up without a first char setting for an unanchored pattern,
    4956                 : it is worth scanning the regex to see if there is an initial asserted first
    4957                 : char. If all branches start with the same asserted char, or with a bracket all
    4958                 : of whose alternatives start with the same asserted char (recurse ad lib), then
    4959                 : we return that char, otherwise -1.
    4960                 : 
    4961                 : Arguments:
    4962                 :   code       points to start of expression (the bracket)
    4963                 :   options    pointer to the options (used to check casing changes)
    4964                 :   inassert   TRUE if in an assertion; a literal met while this is FALSE gives -1
    4965                 : 
    4966                 : Returns:     -1 or the fixed first char (with REQ_CASELESS set if caseless)
    4967                 : */
    4968                 : 
    4969                 : static int
    4970                 : find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
    4971               1 : {
    4972               1 : register int c = -1;   /* -1 means no character has been recorded yet */
    4973                 : do {
    4974                 :    int d;
    4975                 :    const uschar *scode =
    4976               1 :      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
    4977               1 :    register int op = *scode;
    4978                 : 
    4979               1 :    switch(op)
    4980                 :      {
    4981                 :      default:             /* Any other opcode: no asserted first char */
    4982               1 :      return -1;
    4983                 :      /* Brackets: recurse; only OP_ASSERT turns inassert on for the nested scan */
    4984                 :      case OP_BRA:
    4985                 :      case OP_CBRA:
    4986                 :      case OP_ASSERT:
    4987                 :      case OP_ONCE:
    4988                 :      case OP_COND:
    4989               0 :      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
    4990               0 :        return -1;
    4991               0 :      if (c < 0) c = d; else if (c != d) return -1;
    4992               0 :      break;
    4993                 : 
    4994                 :      case OP_EXACT:       /* Step over 2 bytes (presumably the count), then fall through */
    4995               0 :      scode += 2;
    4996                 :      /* Single-character items: scode[1] is used as the character */
    4997                 :      case OP_CHAR:
    4998                 :      case OP_CHARNC:
    4999                 :      case OP_PLUS:
    5000                 :      case OP_MINPLUS:
    5001                 :      case OP_POSPLUS:
    5002               0 :      if (!inassert) return -1;
    5003               0 :      if (c < 0)
    5004                 :        {
    5005               0 :        c = scode[1];
    5006               0 :        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
    5007                 :        }
    5008               0 :      else if (c != scode[1]) return -1;   /* NOTE(review): c may carry REQ_CASELESS here but scode[1] is a bare byte - confirm intended */
    5009                 :      break;
    5010                 :      }
    5011                 : 
    5012               0 :    code += GET(code, 1);   /* Advance to the next alternative, if any */
    5013                 :    }
    5014               0 : while (*code == OP_ALT);
    5015               0 : return c;
    5016                 : }
    5017                 : 
    5018                 : 
    5019                 : 
    5020                 : /*************************************************
    5021                 : *        Compile a Regular Expression            *
    5022                 : *************************************************/
    5023                 : 
    5024                 : /* This function takes a string and returns a pointer to a block of store
    5025                 : holding a compiled version of the expression. The original API for this
    5026                 : function had no error code return variable; it is retained for backwards
    5027                 : compatibility as a call to pcre_compile2() with a NULL errorcodeptr.
    5028                 : 
    5029                 : Arguments:
    5030                 :   pattern       the regular expression
    5031                 :   options       various option bits
    5032                 :   errorcodeptr  pointer to error code variable (pcre_compile2() only)
    5033                 :                   can be NULL if you don't want a code value
    5034                 :   errorptr      pointer to pointer to error text; if NULL, NULL is returned
    5035                 :   erroroffset   ptr offset in pattern where error was detected
    5036                 :   tables        pointer to character tables or NULL for the default tables
    5037                 : 
    5038                 : Returns:        pointer to compiled data block, or NULL on error,
    5039                 :                 with errorptr and erroroffset set
    5040                 : */
    5041                 : 
    5042                 : PCRE_DATA_SCOPE pcre *
    5043                 : pcre_compile(const char *pattern, int options, const char **errorptr,
    5044                 :   int *erroroffset, const unsigned char *tables)
    5045             184 : {
    5046             184 : return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
    5047                 : }
    5048                 : 
    5049                 : 
    5050                 : PCRE_DATA_SCOPE pcre *
    5051                 : pcre_compile2(const char *pattern, int options, int *errorcodeptr,
    5052                 :   const char **errorptr, int *erroroffset, const unsigned char *tables)
    5053             184 : {
    5054                 : real_pcre *re;
    5055             184 : int length = 1;  /* For final END opcode */
    5056                 : int firstbyte, reqbyte, newline;
    5057             184 : int errorcode = 0;
    5058                 : #ifdef SUPPORT_UTF8
    5059                 : BOOL utf8;
    5060                 : #endif
    5061                 : size_t size;
    5062                 : uschar *code;
    5063                 : const uschar *codestart;
    5064                 : const uschar *ptr;
    5065                 : compile_data compile_block;
    5066             184 : compile_data *cd = &compile_block;
    5067                 : 
    5068                 : /* This space is used for "compiling" into during the first phase, when we are
    5069                 : computing the amount of memory that is needed. Compiled items are thrown away
    5070                 : as soon as possible, so that a fairly large buffer should be sufficient for
    5071                 : this purpose. The same space is used in the second phase for remembering where
    5072                 : to fill in forward references to subpatterns. */
    5073                 : 
    5074                 : uschar cworkspace[COMPILE_WORK_SIZE];
    5075                 : 
    5076                 : 
    5077                 : /* Set this early so that early errors get offset 0. */
    5078                 : 
    5079             184 : ptr = (const uschar *)pattern;
    5080                 : 
    5081                 : /* We can't pass back an error message if errorptr is NULL; I guess the best we
    5082                 : can do is just return NULL, but we can set a code value if there is a code
    5083                 : pointer. */
    5084                 : 
    5085             184 : if (errorptr == NULL)
    5086                 :   {
    5087               0 :   if (errorcodeptr != NULL) *errorcodeptr = 99;
    5088               0 :   return NULL;
    5089                 :   }
    5090                 : 
    5091             184 : *errorptr = NULL;
    5092             184 : if (errorcodeptr != NULL) *errorcodeptr = ERR0;
    5093                 : 
    5094                 : /* However, we can give a message for this error */
    5095                 : 
    5096             184 : if (erroroffset == NULL)
    5097                 :   {
    5098               0 :   errorcode = ERR16;
    5099               0 :   goto PCRE_EARLY_ERROR_RETURN;
    5100                 :   }
    5101                 : 
    5102             184 : *erroroffset = 0;
    5103                 : 
    5104                 : /* Can't support UTF8 unless PCRE has been compiled to include the code. */
    5105                 : 
    5106                 : #ifdef SUPPORT_UTF8
    5107             184 : utf8 = (options & PCRE_UTF8) != 0;
    5108             184 : if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
    5109                 :      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
    5110                 :   {
    5111               0 :   errorcode = ERR44;
    5112               0 :   goto PCRE_UTF8_ERROR_RETURN;
    5113                 :   }
    5114                 : #else
    5115                 : if ((options & PCRE_UTF8) != 0)
    5116                 :   {
    5117                 :   errorcode = ERR32;
    5118                 :   goto PCRE_EARLY_ERROR_RETURN;
    5119                 :   }
    5120                 : #endif
    5121                 : 
    5122             184 : if ((options & ~PUBLIC_OPTIONS) != 0)
    5123                 :   {
    5124               0 :   errorcode = ERR17;
    5125               0 :   goto PCRE_EARLY_ERROR_RETURN;
    5126                 :   }
    5127                 : 
    5128                 : /* Set up pointers to the individual character tables */
    5129                 : 
    5130             184 : if (tables == NULL) tables = _pcre_default_tables;
    5131             184 : cd->lcc = tables + lcc_offset;
    5132             184 : cd->fcc = tables + fcc_offset;
    5133             184 : cd->cbits = tables + cbits_offset;
    5134             184 : cd->ctypes = tables + ctypes_offset;
    5135                 : 
    5136                 : /* Handle different types of newline. The three bits give seven cases. The
    5137                 : current code allows for fixed one- or two-byte sequences, plus "any". */
    5138                 : 
    5139             184 : switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
    5140                 :   {
    5141             184 :   case 0: newline = NEWLINE; break;   /* Compile-time default */
    5142               0 :   case PCRE_NEWLINE_CR: newline = '\r'; break;
    5143               0 :   case PCRE_NEWLINE_LF: newline = '\n'; break;
    5144                 :   case PCRE_NEWLINE_CR+
    5145               0 :        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
    5146               0 :   case PCRE_NEWLINE_ANY: newline = -1; break;
    5147               0 :   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
    5148                 :   }
    5149                 : 
    5150             184 : if (newline < 0)
    5151                 :   {
    5152               0 :   cd->nltype = NLTYPE_ANY;
    5153                 :   }
    5154                 : else
    5155                 :   {
    5156             184 :   cd->nltype = NLTYPE_FIXED;
    5157             184 :   if (newline > 255)
    5158                 :     {
    5159               0 :     cd->nllen = 2;
    5160               0 :     cd->nl[0] = (newline >> 8) & 255;
    5161               0 :     cd->nl[1] = newline & 255;
    5162                 :     }
    5163                 :   else
    5164                 :     {
    5165             184 :     cd->nllen = 1;
    5166             184 :     cd->nl[0] = newline;
    5167                 :     }
    5168                 :   }
    5169                 : 
    5170                 : /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
    5171                 : references to help in deciding whether (.*) can be treated as anchored or not.
    5172                 : */
    5173                 : 
    5174             184 : cd->top_backref = 0;
    5175             184 : cd->backref_map = 0;
    5176                 : 
    5177                 : /* Reflect pattern for debugging output */
    5178                 : 
    5179                 : DPRINTF(("------------------------------------------------------------------\n"));
    5180                 : DPRINTF(("%s\n", pattern));
    5181                 : 
    5182                 : /* Pretend to compile the pattern while actually just accumulating the length
    5183                 : of memory required. This behaviour is triggered by passing a non-NULL final
    5184                 : argument to compile_regex(). We pass a block of workspace (cworkspace) for it
    5185                 : to compile parts of the pattern into; the compiled code is discarded when it is
    5186                 : no longer needed, so hopefully this workspace will never overflow, though there
    5187                 : is a test for its doing so. */
    5188                 : 
    5189             184 : cd->bracount = 0;
    5190             184 : cd->names_found = 0;
    5191             184 : cd->name_entry_size = 0;
    5192             184 : cd->name_table = NULL;
    5193             184 : cd->start_workspace = cworkspace;
    5194             184 : cd->start_code = cworkspace;
    5195             184 : cd->hwm = cworkspace;
    5196             184 : cd->start_pattern = (const uschar *)pattern;
    5197             184 : cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
    5198             184 : cd->req_varyopt = 0;
    5199             184 : cd->nopartial = FALSE;
    5200             184 : cd->external_options = options;
    5201                 : 
    5202                 : /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
    5203                 : don't need to look at the result of the function here. The initial options have
    5204                 : been put into the cd block so that they can be changed if an option setting is
    5205                 : found within the regex right at the beginning. Bringing initial option settings
    5206                 : outside can help speed up starting point checks. */
    5207                 : 
    5208             184 : code = cworkspace;
    5209             184 : *code = OP_BRA;
    5210             184 : (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
    5211                 :   &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
    5212             184 : if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
    5213                 : 
    5214                 : DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
    5215                 :   cd->hwm - cworkspace));
    5216                 : 
    5217             184 : if (length > MAX_PATTERN_SIZE)
    5218                 :   {
    5219               0 :   errorcode = ERR20;
    5220               0 :   goto PCRE_EARLY_ERROR_RETURN;
    5221                 :   }
    5222                 : 
    5223                 : /* Compute the size of data block needed and get it, either from malloc or
    5224                 : externally provided function. Integer overflow should no longer be possible
    5225                 : because nowadays we limit the maximum value of cd->names_found and
    5226                 : cd->name_entry_size. */
    5227                 : 
    5228             184 : size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
    5229             184 : re = (real_pcre *)(pcre_malloc)(size);
    5230                 : 
    5231             184 : if (re == NULL)
    5232                 :   {
    5233               0 :   errorcode = ERR21;
    5234               0 :   goto PCRE_EARLY_ERROR_RETURN;
    5235                 :   }
    5236                 : 
    5237                 : /* Put in the magic number, and save the sizes, initial options, and character
    5238                 : table pointer. NULL is used for the default character tables. The nullpad field
    5239                 : is at the end; it's there to help in the case when a regex compiled on a system
    5240                 : with 4-byte pointers is run on another with 8-byte pointers. */
    5241                 : 
    5242             184 : re->magic_number = MAGIC_NUMBER;
    5243             184 : re->size = size;
    5244             184 : re->options = cd->external_options;
    5245             184 : re->dummy1 = 0;
    5246             184 : re->first_byte = 0;
    5247             184 : re->req_byte = 0;
    5248             184 : re->name_table_offset = sizeof(real_pcre);
    5249             184 : re->name_entry_size = cd->name_entry_size;
    5250             184 : re->name_count = cd->names_found;
    5251             184 : re->ref_count = 0;
    5252             184 : re->tables = (tables == _pcre_default_tables)? NULL : tables;
    5253             184 : re->nullpad = NULL;
    5254                 : 
    5255                 : /* The starting points of the name/number translation table and of the code are
    5256                 : passed around in the compile data block. The start/end pattern and initial
    5257                 : options are already set from the pre-compile phase, as is the name_entry_size
    5258                 : field. Reset the bracket count and the names_found field. Also reset the hwm
    5259                 : field; this time it's used for remembering forward references to subpatterns.
    5260                 : */
    5261                 : 
    5262             184 : cd->bracount = 0;
    5263             184 : cd->names_found = 0;
    5264             184 : cd->name_table = (uschar *)re + re->name_table_offset;
    5265             184 : codestart = cd->name_table + re->name_entry_size * re->name_count;
    5266             184 : cd->start_code = codestart;
    5267             184 : cd->hwm = cworkspace;
    5268             184 : cd->req_varyopt = 0;
    5269             184 : cd->nopartial = FALSE;
    5270                 : 
    5271                 : /* Set up a starting, non-extracting bracket, then compile the expression. On
    5272                 : error, errorcode will be set non-zero, so we don't need to look at the result
    5273                 : of the function here. */
    5274                 : 
    5275             184 : ptr = (const uschar *)pattern;
    5276             184 : code = (uschar *)codestart;
    5277             184 : *code = OP_BRA;
    5278             184 : (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
    5279                 :   &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
    5280             184 : re->top_bracket = cd->bracount;
    5281             184 : re->top_backref = cd->top_backref;
    5282                 : 
    5283             184 : if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
    5284                 : 
    5285                 : /* If not reached end of pattern on success, there's an excess bracket. */
    5286                 : 
    5287             184 : if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
    5288                 : 
    5289                 : /* Fill in the terminating state and check for disastrous overflow, but
    5290                 : if debugging, leave the test till after things are printed out. */
    5291                 : 
    5292             184 : *code++ = OP_END;
    5293                 : 
    5294                 : #ifndef DEBUG
    5295             184 : if (code - codestart > length) errorcode = ERR23;
    5296                 : #endif
    5297                 : 
    5298                 : /* Fill in any forward references that are required. */
    5299                 : 
    5300             368 : while (errorcode == 0 && cd->hwm > cworkspace)
    5301                 :   {
    5302                 :   int offset, recno;
    5303                 :   const uschar *groupptr;
    5304               0 :   cd->hwm -= LINK_SIZE;
    5305               0 :   offset = GET(cd->hwm, 0);
    5306               0 :   recno = GET(codestart, offset);
    5307               0 :   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
    5308               0 :   if (groupptr == NULL) errorcode = ERR53;
    5309               0 :     else PUT(((uschar *)codestart), offset, groupptr - codestart);
    5310                 :   }
    5311                 : 
    5312                 : /* Give an error if there's back reference to a non-existent capturing
    5313                 : subpattern. */
    5314                 : 
    5315             184 : if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
    5316                 : 
    5317                 : /* Failed to compile, or error while post-processing */
    5318                 : 
    5319             184 : if (errorcode != 0)
    5320                 :   {
    5321               0 :   (pcre_free)(re);
    5322               0 :   PCRE_EARLY_ERROR_RETURN:
    5323               0 :   *erroroffset = ptr - (const uschar *)pattern;
    5324                 : #ifdef SUPPORT_UTF8
    5325               0 :   PCRE_UTF8_ERROR_RETURN:
    5326                 : #endif
    5327               0 :   *errorptr = error_texts[errorcode];
    5328               0 :   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
    5329               0 :   return NULL;
    5330                 :   }
    5331                 : 
    5332                 : /* If the anchored option was not passed, set the flag if we can determine that
    5333                 : the pattern is anchored by virtue of ^ characters or \A or anything else (such
    5334                 : as starting with .* when DOTALL is set).
    5335                 : 
    5336                 : Otherwise, if we know what the first byte has to be, save it, because that
    5337                 : speeds up unanchored matches no end. If not, see if we can set the
    5338                 : PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
    5339                 : start with ^. and also when all branches start with .* for non-DOTALL matches.
    5340                 : */
    5341                 : 
    5342             184 : if ((re->options & PCRE_ANCHORED) == 0)
    5343                 :   {
    5344             184 :   int temp_options = re->options;   /* May get changed during these scans */
    5345             184 :   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
    5346             182 :     re->options |= PCRE_ANCHORED;
    5347                 :   else
    5348                 :     {
    5349               2 :     if (firstbyte < 0)
    5350               1 :       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
    5351               2 :     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
    5352                 :       {
    5353               1 :       int ch = firstbyte & 255;
    5354               1 :       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
    5355                 :          cd->fcc[ch] == ch)? ch : firstbyte;
    5356               1 :       re->options |= PCRE_FIRSTSET;
    5357                 :       }
    5358               1 :     else if (is_startline(codestart, 0, cd->backref_map))
    5359               0 :       re->options |= PCRE_STARTLINE;
    5360                 :     }
    5361                 :   }
    5362                 : 
    5363                 : /* For an anchored pattern, we use the "required byte" only if it follows a
    5364                 : variable length item in the regex. Remove the caseless flag for non-caseable
    5365                 : bytes. */
    5366                 : 
    5367             184 : if (reqbyte >= 0 &&
    5368                 :      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
    5369                 :   {
    5370             107 :   int ch = reqbyte & 255;
    5371             107 :   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
    5372                 :     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
    5373             107 :   re->options |= PCRE_REQCHSET;
    5374                 :   }
    5375                 : 
    5376                 : /* Print out the compiled data if debugging is enabled. This is never the
    5377                 : case when building a production library. */
    5378                 : 
    5379                 : #ifdef DEBUG
    5380                 : 
    5381                 : printf("Length = %d top_bracket = %d top_backref = %d\n",
    5382                 :   length, re->top_bracket, re->top_backref);
    5383                 : 
    5384                 : if (re->options != 0)
    5385                 :   {
    5386                 :   printf("%s%s%s%s%s%s%s%s%s\n",
    5387                 :     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
    5388                 :     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
    5389                 :     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
    5390                 :     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
    5391                 :     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
    5392                 :     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
    5393                 :     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
    5394                 :     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
    5395                 :     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
    5396                 :   }
    5397                 : 
    5398                 : if ((re->options & PCRE_FIRSTSET) != 0)
    5399                 :   {
    5400                 :   int ch = re->first_byte & 255;
    5401                 :   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
    5402                 :     "" : " (caseless)";
    5403                 :   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
    5404                 :     else printf("First char = \\x%02x%s\n", ch, caseless);
    5405                 :   }
    5406                 : 
    5407                 : if ((re->options & PCRE_REQCHSET) != 0)
    5408                 :   {
    5409                 :   int ch = re->req_byte & 255;
    5410                 :   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
    5411                 :     "" : " (caseless)";
    5412                 :   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
    5413                 :     else printf("Req char = \\x%02x%s\n", ch, caseless);
    5414                 :   }
    5415                 : 
    5416                 : pcre_printint(re, stdout);
    5417                 : 
    5418                 : /* This check is done here in the debugging case so that the code that
    5419                 : was compiled can be seen. */
    5420                 : 
    5421                 : if (code - codestart > length)
    5422                 :   {
    5423                 :   (pcre_free)(re);
    5424                 :   *errorptr = error_texts[ERR23];
    5425                 :   *erroroffset = ptr - (uschar *)pattern;
    5426                 :   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
    5427                 :   return NULL;
    5428                 :   }
    5429                 : #endif   /* DEBUG */
    5430                 : 
    5431             184 : return (pcre *)re;
    5432                 : }
    5433                 : 
    5434                 : /* End of pcre_compile.c */
Generated by: LTP GCOV extension version 1.5