1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /* PCRE is a library of functions to support regular expressions whose syntax
6 : and semantics are as close as possible to those of the Perl 5 language.
7 :
8 : Written by Philip Hazel
9 : Copyright (c) 1997-2006 University of Cambridge
10 :
11 : -----------------------------------------------------------------------------
12 : Redistribution and use in source and binary forms, with or without
13 : modification, are permitted provided that the following conditions are met:
14 :
15 : * Redistributions of source code must retain the above copyright notice,
16 : this list of conditions and the following disclaimer.
17 :
18 : * Redistributions in binary form must reproduce the above copyright
19 : notice, this list of conditions and the following disclaimer in the
20 : documentation and/or other materials provided with the distribution.
21 :
22 : * Neither the name of the University of Cambridge nor the names of its
23 : contributors may be used to endorse or promote products derived from
24 : this software without specific prior written permission.
25 :
26 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 : POSSIBILITY OF SUCH DAMAGE.
37 : -----------------------------------------------------------------------------
38 : */
39 :
40 :
41 : /* This module contains code for searching the table of Unicode character
42 : properties. */
43 :
44 : #include "pcre_internal.h"
45 :
46 : #include "ucp.h" /* Category definitions */
47 : #include "ucpinternal.h" /* Internal table details */
48 : #include "ucptable.c" /* The table itself */
49 :
50 :
51 : /* Table to translate from particular type value to the general value. */
52 :
53 : static int ucp_gentype[] = {
54 : ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
55 : ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
56 : ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
57 : ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
58 : ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
59 : ucp_P, ucp_P, /* Ps, Po */
60 : ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
61 : ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
62 : };
63 :
64 :
65 :
66 : /*************************************************
67 : * Search table and return type *
68 : *************************************************/
69 :
70 : /* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed
71 : character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc.
72 :
73 : Arguments:
74 : c the character value
75 : type_ptr the detailed character type is returned here
76 : script_ptr the script is returned here
77 :
78 : Returns: the character type category
79 : */
80 :
81 : int
82 : _pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
83 0 : {
84 0 : int bot = 0;
85 0 : int top = sizeof(ucp_table)/sizeof(cnode);
86 : int mid;
87 :
88 : /* The table is searched using a binary chop. You might think that using
89 : intermediate variables to hold some of the common expressions would speed
90 : things up, but tests with gcc 3.4.4 on Linux showed that, on the contrary, it
91 : makes things a lot slower. */
92 :
93 : for (;;)
94 : {
95 0 : if (top <= bot)
96 : {
97 0 : *type_ptr = ucp_Cn;
98 0 : *script_ptr = ucp_Common;
99 0 : return ucp_C;
100 : }
101 0 : mid = (bot + top) >> 1;
102 0 : if (c == (ucp_table[mid].f0 & f0_charmask)) break;
103 0 : if (c < (ucp_table[mid].f0 & f0_charmask)) top = mid;
104 : else
105 : {
106 0 : if ((ucp_table[mid].f0 & f0_rangeflag) != 0 &&
107 : c <= (ucp_table[mid].f0 & f0_charmask) +
108 0 : (ucp_table[mid].f1 & f1_rangemask)) break;
109 0 : bot = mid + 1;
110 : }
111 0 : }
112 :
113 : /* Found an entry in the table. Set the script and detailed type values, and
114 : return the general type. */
115 :
116 0 : *script_ptr = (ucp_table[mid].f0 & f0_scriptmask) >> f0_scriptshift;
117 0 : *type_ptr = (ucp_table[mid].f1 & f1_typemask) >> f1_typeshift;
118 :
119 0 : return ucp_gentype[*type_ptr];
120 : }
121 :
122 :
123 :
124 : /*************************************************
125 : * Search table and return other case *
126 : *************************************************/
127 :
128 : /* If the given character is a letter, and there is another case for the
129 : letter, return the other case. Otherwise, return NOTACHAR.
130 :
131 : Arguments:
132 : c the character value
133 :
134 : Returns: the other case or NOTACHAR if none
135 : */
136 :
137 : unsigned int
138 : _pcre_ucp_othercase(const unsigned int c)
139 0 : {
140 0 : int bot = 0;
141 0 : int top = sizeof(ucp_table)/sizeof(cnode);
142 : int mid, offset;
143 :
144 : /* The table is searched using a binary chop. You might think that using
145 : intermediate variables to hold some of the common expressions would speed
146 : things up, but tests with gcc 3.4.4 on Linux showed that, on the contrary, it
147 : makes things a lot slower. */
148 :
149 : for (;;)
150 : {
151 0 : if (top <= bot) return -1;
152 0 : mid = (bot + top) >> 1;
153 0 : if (c == (ucp_table[mid].f0 & f0_charmask)) break;
154 0 : if (c < (ucp_table[mid].f0 & f0_charmask)) top = mid;
155 : else
156 : {
157 0 : if ((ucp_table[mid].f0 & f0_rangeflag) != 0 &&
158 : c <= (ucp_table[mid].f0 & f0_charmask) +
159 0 : (ucp_table[mid].f1 & f1_rangemask)) break;
160 0 : bot = mid + 1;
161 : }
162 0 : }
163 :
164 : /* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise
165 : return the other case if there is one, else NOTACHAR. */
166 :
167 0 : if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR;
168 :
169 0 : offset = ucp_table[mid].f1 & f1_casemask;
170 0 : if ((offset & f1_caseneg) != 0) offset |= f1_caseneg;
171 0 : return (offset == 0)? NOTACHAR : c + offset;
172 : }
173 :
174 :
175 : /* End of pcre_ucp_searchfuncs.c */
|