LCOV - PHP Code Coverage - ext/standard/metaphone.c

LTP GCOV extension - code coverage report
Current view:	directory - ext/standard - metaphone.c
Test:	PHP Code Coverage
Date:	2007-04-10	Instrumented lines:	140
Code covered:	0.0 %	Executed lines:	0
Legend:	not executed executed
       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 5                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2007 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Author: Thies C. Arntzen <thies@thieso.net>                          |
      16                 :    +----------------------------------------------------------------------+
      17                 : */
      18                 : 
      19                 : /* $Id: metaphone.c,v 1.28.2.1.2.4 2007/01/01 09:36:08 sebastian Exp $ */
      20                 : 
      21                 : /*
      22                 :         Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      23                 : */
      24                 : 
      25                 : #include "php.h"
      26                 : #include "php_metaphone.h"
      27                 : 
      28                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional); /* forward declaration; the encoder is defined later in this file */
      29                 : 
      30                 : /* {{{ proto string metaphone(string text[, int phones])
      31                 :    Break English phrases down into their phonemes; yields FALSE when the encoder returns non-zero */
      32                 : PHP_FUNCTION(metaphone)
      33               0 : {
      34                 :         char *str;
      35               0 :         char *result = 0;                       /* stays NULL if the encoder bails out before allocating */
      36                 :         int str_len;
      37               0 :         long phones = 0;                        /* optional argument; 0 is treated by the encoder as "no limit" */
      38                 : 
      39               0 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
      40                 :                                                           &phones) == FAILURE) {
      41               0 :                 return;
      42                 :         }
      43                 : 
      44               0 :         if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { /* final 1 = traditional rules */
      45               0 :                 RETVAL_STRING(result, 0); /* NOTE(review): 0 presumably hands the buffer over without copying - confirm against the Zend API */
      46                 :         } else {
      47               0 :                 if (result) {
      48               0 :                         efree(result);
      49                 :                 }
      50               0 :                 RETURN_FALSE;
      51                 :         }
      52                 : }
      53                 : /* }}} */
      54                 : 
      55                 : /* 
      56                 :    This is now the original code by Michael G Schwern;
      57                 :    I've changed it just slightly (use emalloc,
      58                 :    get rid of includes, etc.)
      59                 :         - thies - 13.09.1999
      60                 : */
      61                 : 
      62                 : /*-----------------------------  */
      63                 : /* this used to be "metaphone.h" */
      64                 : /*-----------------------------  */
      65                 : 
      66                 : /* Special encodings: single output characters that stand for two-letter sounds */
      67                 : #define  SH     'X'             /* emitted for the 'sh' sound */
      68                 : #define  TH             '0'             /* emitted for the 'th' sound (the digit zero) */
      69                 : 
      70                 : /*-----------------------------  */
      71                 : /* end of "metaphone.h"          */
      72                 : /*-----------------------------  */
      73                 : 
      74                 : /*----------------------------- */
      75                 : /* this used to be "metachar.h" */
      76                 : /*----------------------------- */
      77                 : 
      78                 : /* Metachar.h ... little bits about characters for metaphone */
      79                 : /*-- Character encoding array & accessing macros: one flag byte per letter, indexed by (uppercase letter - 'A') --*/
      80                 : /* Stolen directly out of the book...  Flags: 1 = vowel, 2 = passed through unchanged, 4 = affected by a following H, 8 = makes C/G soft, 16 = prevents GH -> F */
      81                 : char _codes[26] =
      82                 : {
      83                 :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
      84                 : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
      85                 : };
      86                 : 
      87                 : 
      88                 : #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) /* flag byte for c, 0 for non-letters; evaluates c twice. NOTE(review): assumes isalpha() only accepts A-Z/a-z - confirm for non-"C" locales */
      89                 : 
      90                 : #define isvowel(c)  (ENCODE(c) & 1)         /* AEIOU */
      91                 : 
      92                 : /* These letters are passed through unchanged */
      93                 : #define NOCHANGE(c) (ENCODE(c) & 2)         /* FJLMNR */
      94                 : 
      95                 : /* These form diphthongs when preceding H */
      96                 : #define AFFECTH(c)  (ENCODE(c) & 4)         /* CGPST */
      97                 : 
      98                 : /* These make C and G soft */
      99                 : #define MAKESOFT(c) (ENCODE(c) & 8)         /* EIY */
     100                 : 
     101                 : /* These prevent GH from becoming F */
     102                 : #define NOGHTOF(c)  (ENCODE(c) & 16)        /* BDH */
     103                 : 
     104                 : /*----------------------------- */
     105                 : /* end of "metachar.h"          */
     106                 : /*----------------------------- */
     107                 : 
     108                 : /* I suppose I could have been using a character pointer instead of
     109                 :  * accessing the array directly...  All of these macros read the locals `word` and `w_idx`. */
     110                 : 
     111                 : /* Look at the next letter in the word (uppercased) */
     112                 : #define Next_Letter (toupper(word[w_idx+1]))
     113                 : /* Look at the current letter in the word (uppercased) */
     114                 : #define Curr_Letter (toupper(word[w_idx]))
     115                 : /* Go N letters back; yields '\0' when fewer than N letters precede the current one. */
     116                 : #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
     117                 : /* Previous letter, or '\0' when we are on the first character of the word. */
     118                 : #define Prev_Letter (Look_Back_Letter(1))
     119                 : /* Look two letters down.  It makes sure you don't walk off the string. */
     120                 : #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
     121                 :                                                                                              : '\0')
     122                 : #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n))) /* letter n positions ahead, via Lookahead() so it stops at the terminator */
     123                 : 
     124                 : 
     125                 : /* Allows us to safely look ahead an arbitrary # of letters: returns word[how_far], or '\0' if the string ends first */
     126                 : /* I probably could have just used strlen... */
     127                 : static char Lookahead(char *word, int how_far)
     128               0 : {
     129               0 :         char letter_ahead = '\0';       /* null by default */
     130                 :         int idx;
     131               0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     132                 :         /* Edge forward in the string, one character at a time, never stepping past the terminator */
     133                 : 
     134               0 :         letter_ahead = word[idx];       /* idx will be either == to how_far or
     135                 :                                                                  * at the end of the string
     136                 :                                                                  */
     137               0 :         return letter_ahead;
     138                 : }
     139                 : 
     140                 : 
     141                 : /* Phonize: append one phoneme character to *phoned_word.
     142                 :  * The buffer's final size is not known in advance, so it is grown by 2 characters
     143                 :  * whenever it is full.  The allocation is always max_buffer_len + 1 bytes so that
     144                 :  * End_Phoned_Word can store the terminating '\0' at index p_idx <= max_buffer_len. */
     145                 : #define Phonize(c)      { \
     146                 :                                                 if (p_idx >= max_buffer_len) { \
     147                 :                                                         *phoned_word = erealloc(*phoned_word, max_buffer_len + 2 + 1); \
     148                 :                                                         max_buffer_len += 2; \
     149                 :                                                 } \
     150                 :                                                 (*phoned_word)[p_idx++] = (c); \
     151                 :                                         }
     152                 : /* Slap a null character on the end of the phoned word (writes index p_idx, so the buffer must hold p_idx + 1 chars) */
     153                 : #define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';}
     154                 : /* How long is the phoned word? */
     155                 : #define Phone_Len       (p_idx)
     156                 : 
     157                 : /* Note if a letter is a 'break' in the word, i.e. any non-alphabetic character (including '\0') */
     158                 : #define Isbreak(c)  (!isalpha(c))
     159                 : 
     160                 : /* {{{ metaphone
     161                 :    Encode `word` into a newly allocated string stored in *phoned_word. Returns -1 for a negative max_phonemes or a NULL word, otherwise 0/SUCCESS. max_phonemes == 0 means no limit. */
     162                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
     163               0 : {
     164               0 :         int w_idx = 0;                          /* point in the phonization we're at. */
     165               0 :         int p_idx = 0;                          /* end of the phoned phrase */
     166               0 :         int max_buffer_len = 0;         /* maximum length of the destination buffer */
     167                 : 
     168                 : /*-- Parameter checks --*/
     169                 :         /* Negative phoneme length is meaningless */
     170                 : 
     171               0 :         if (max_phonemes < 0)
     172               0 :                 return -1;
     173                 : 
     174                 :         /* Empty/null string is meaningless */
     175                 :         /* Overly paranoid */
     176                 :         /* assert(word != NULL && word[0] != '\0'); */
     177                 : 
     178               0 :         if (word == NULL)
     179               0 :                 return -1;
     180                 : 
     181                 : /*-- Allocate memory for our phoned_phrase --*/
     182               0 :         if (max_phonemes == 0) {        /* Assume largest possible */
     183               0 :                 max_buffer_len = word_len;
     184               0 :                 *phoned_word = safe_emalloc(sizeof(char), word_len, 1); /* word_len chars plus 1 for the terminator */
     185                 :         } else {
     186               0 :                 max_buffer_len = max_phonemes;
     187               0 :                 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1); /* max_phonemes chars plus 1 for the terminator */
     188                 :         }
     189                 : 
     190                 : 
     191                 : /*-- The first phoneme has to be processed specially. --*/
     192                 :         /* Find our first letter */
     193               0 :         for (; !isalpha(Curr_Letter); w_idx++) {
     194                 :                 /* On the off chance we were given nothing but crap... */
     195               0 :                 if (Curr_Letter == '\0') {
     196               0 :                         End_Phoned_Word
     197               0 :                                 return SUCCESS; /* For testing */
     198                 :                 }
     199                 :         }
     200                 : 
     201               0 :         switch (Curr_Letter) {
     202                 :                 /* AE becomes E */
     203                 :         case 'A':
     204               0 :                 if (Next_Letter == 'E') {
     205               0 :                         Phonize('E');
     206               0 :                         w_idx += 2;
     207                 :                 }
     208                 :                 /* Remember, preserve vowels at the beginning */
     209                 :                 else {
     210               0 :                         Phonize('A');
     211               0 :                         w_idx++;
     212                 :                 }
     213               0 :                 break;
     214                 :                 /* [GKP]N becomes N */
     215                 :         case 'G':
     216                 :         case 'K':
     217                 :         case 'P':
     218               0 :                 if (Next_Letter == 'N') {
     219               0 :                         Phonize('N');
     220               0 :                         w_idx += 2;
     221                 :                 }
     222               0 :                 break;
     223                 :                 /* WH becomes H,
     224                 :                    WR becomes R,
     225                 :                    W if followed by a vowel */
     226                 :         case 'W':
     227               0 :                 if (Next_Letter == 'H' ||
     228                 :                         Next_Letter == 'R') {
     229               0 :                         Phonize(Next_Letter);
     230               0 :                         w_idx += 2;
     231               0 :                 } else if (isvowel(Next_Letter)) {
     232               0 :                         Phonize('W');
     233               0 :                         w_idx += 2;
     234                 :                 }
     235                 :                 /* else ignore */
     236               0 :                 break;
     237                 :                 /* X becomes S */
     238                 :         case 'X':
     239               0 :                 Phonize('S');
     240               0 :                 w_idx++;
     241               0 :                 break;
     242                 :                 /* Vowels are kept */
     243                 :                 /* We did A already
     244                 :                    case 'A':
     245                 :                    case 'a':
     246                 :                  */
     247                 :         case 'E':
     248                 :         case 'I':
     249                 :         case 'O':
     250                 :         case 'U':
     251               0 :                 Phonize(Curr_Letter);
     252               0 :                 w_idx++;
     253                 :                 break;
     254                 :         default:
     255                 :                 /* do nothing: w_idx is left alone so the main loop encodes this letter */
     256                 :                 break;
     257                 :         }
     258                 : 
     259                 : 
     260                 : 
     261                 :         /* On to the metaphoning */
     262               0 :         for (; Curr_Letter != '\0' &&
     263                 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     264               0 :                  w_idx++) {
     265                 :                 /* How many letters to skip because an earlier encoding handled
     266                 :                  * multiple letters */
     267               0 :                 unsigned short int skip_letter = 0;
     268                 : 
     269                 : 
     270                 :                 /* THOUGHT:  It would be nice if, rather than having things like...
     271                 :                  * well, SCI.  For SCI you encode the S, then have to remember
     272                 :                  * to skip the C.  So the phoneme SCI invades both S and C.  It would
     273                 :                  * be better, IMHO, to skip the C from the S part of the encoding.
     274                 :                  * Hell, I'm trying it.
     275                 :                  */
     276                 : 
     277                 :                 /* Ignore non-alphas */
     278               0 :                 if (!isalpha(Curr_Letter))
     279               0 :                         continue;
     280                 : 
     281                 :                 /* Drop duplicates, except CC */
     282               0 :                 if (Curr_Letter == Prev_Letter &&
     283                 :                         Curr_Letter != 'C')
     284               0 :                         continue;
     285                 : 
     286               0 :                 switch (Curr_Letter) {
     287                 :                         /* B -> B unless in MB */
     288                 :                 case 'B':
     289               0 :                         if (Prev_Letter != 'M')
     290               0 :                                 Phonize('B');
     291               0 :                         break;
     292                 :                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
     293                 :                          * (SCHW is handled in S)
     294                 :                          *  S if -CI-, -CE- or -CY-
     295                 :                          *  dropped if -SCI-, SCE-, -SCY- (handled in S)
     296                 :                          *  else K
     297                 :                          */
     298                 :                 case 'C':
     299               0 :                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
     300               0 :                                 if (After_Next_Letter == 'A' &&
     301                 :                                         Next_Letter == 'I') {   /* CIA */
     302               0 :                                         Phonize(SH);
     303                 :                                 }
     304                 :                                 /* SC[IEY] */
     305               0 :                                 else if (Prev_Letter == 'S') {
     306                 :                                         /* Dropped */
     307                 :                                 } else {
     308               0 :                                         Phonize('S');
     309                 :                                 }
     310               0 :                         } else if (Next_Letter == 'H') {
     311               0 :                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School; only when traditional == 0 */
     312               0 :                                         Phonize('K');
     313                 :                                 } else {
     314               0 :                                         Phonize(SH);
     315                 :                                 }
     316               0 :                                 skip_letter++;  /* the H has been consumed too */
     317                 :                         } else {
     318               0 :                                 Phonize('K');
     319                 :                         }
     320               0 :                         break;
     321                 :                         /* J if in -DGE-, -DGI- or -DGY-
     322                 :                          * else T
     323                 :                          */
     324                 :                 case 'D':
     325               0 :                         if (Next_Letter == 'G' &&
     326                 :                                 MAKESOFT(After_Next_Letter)) {
     327               0 :                                 Phonize('J');
     328               0 :                                 skip_letter++;  /* the G has been consumed too */
     329                 :                         } else
     330               0 :                                 Phonize('T');
     331               0 :                         break;
     332                 :                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
     333                 :                          * else dropped if -GNED, -GN,
     334                 :                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
     335                 :                          * else J if in -GE-, -GI, -GY and not GG
     336                 :                          * else K
     337                 :                          */
     338                 :                 case 'G':
     339               0 :                         if (Next_Letter == 'H') {
     340               0 :                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
     341                 :                                           Look_Back_Letter(4) == 'H')) {
     342               0 :                                         Phonize('F');
     343               0 :                                         skip_letter++;  /* the H has been consumed too */
     344                 :                                 } else {
     345                 :                                         /* silent */
     346                 :                                 }
     347               0 :                         } else if (Next_Letter == 'N') {
     348               0 :                                 if (Isbreak(After_Next_Letter) ||
     349                 :                                         (After_Next_Letter == 'E' &&
     350                 :                                          Look_Ahead_Letter(3) == 'D')) {
     351                 :                                         /* dropped */
     352                 :                                 } else
     353               0 :                                         Phonize('K');
     354               0 :                         } else if (MAKESOFT(Next_Letter) &&
     355                 :                                            Prev_Letter != 'G') {
     356               0 :                                 Phonize('J');
     357                 :                         } else {
     358               0 :                                 Phonize('K');
     359                 :                         }
     360               0 :                         break;
     361                 :                         /* H if before a vowel and not after C,G,P,S,T */
     362                 :                 case 'H':
     363               0 :                         if (isvowel(Next_Letter) &&
     364                 :                                 !AFFECTH(Prev_Letter))
     365               0 :                                 Phonize('H');
     366               0 :                         break;
     367                 :                         /* dropped if after C
     368                 :                          * else K
     369                 :                          */
     370                 :                 case 'K':
     371               0 :                         if (Prev_Letter != 'C')
     372               0 :                                 Phonize('K');
     373               0 :                         break;
     374                 :                         /* F if before H
     375                 :                          * else P
     376                 :                          */
     377                 :                 case 'P':
     378               0 :                         if (Next_Letter == 'H') {
     379               0 :                                 Phonize('F');
     380                 :                         } else {
     381               0 :                                 Phonize('P');
     382                 :                         }
     383               0 :                         break;
     384                 :                         /* K
     385                 :                          */
     386                 :                 case 'Q':
     387               0 :                         Phonize('K');
     388               0 :                         break;
     389                 :                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
     390                 :                          * else S
     391                 :                          */
     392                 :                 case 'S':
     393               0 :                         if (Next_Letter == 'I' &&
     394                 :                                 (After_Next_Letter == 'O' ||
     395                 :                                  After_Next_Letter == 'A')) {
     396               0 :                                 Phonize(SH);
     397               0 :                         } else if (Next_Letter == 'H') {
     398               0 :                                 Phonize(SH);
     399               0 :                                 skip_letter++;  /* the H has been consumed too */
     400               0 :                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { /* SCHW; only when traditional == 0 */
     401               0 :                                 Phonize(SH);
     402               0 :                                 skip_letter += 2;       /* the C and H have been consumed too */
     403                 :                         } else {
     404               0 :                                 Phonize('S');
     405                 :                         }
     406               0 :                         break;
     407                 :                         /* 'sh' in -TIA- or -TIO-
     408                 :                          * else 'th' before H
     409                 :                          * else T
     410                 :                          */
     411                 :                 case 'T':
     412               0 :                         if (Next_Letter == 'I' &&
     413                 :                                 (After_Next_Letter == 'O' ||
     414                 :                                  After_Next_Letter == 'A')) {
     415               0 :                                 Phonize(SH);
     416               0 :                         } else if (Next_Letter == 'H') {
     417               0 :                                 Phonize(TH);
     418               0 :                                 skip_letter++;  /* the H has been consumed too */
     419                 :                         } else {
     420               0 :                                 Phonize('T');
     421                 :                         }
     422               0 :                         break;
     423                 :                         /* F */
     424                 :                 case 'V':
     425               0 :                         Phonize('F');
     426               0 :                         break;
     427                 :                         /* W before a vowel, else dropped */
     428                 :                 case 'W':
     429               0 :                         if (isvowel(Next_Letter))
     430               0 :                                 Phonize('W');
     431               0 :                         break;
     432                 :                         /* KS: the one rule that emits two phonemes for a single letter */
     433                 :                 case 'X':
     434               0 :                         Phonize('K');
     435               0 :                         Phonize('S');
     436               0 :                         break;
     437                 :                         /* Y if followed by a vowel */
     438                 :                 case 'Y':
     439               0 :                         if (isvowel(Next_Letter))
     440               0 :                                 Phonize('Y');
     441               0 :                         break;
     442                 :                         /* S */
     443                 :                 case 'Z':
     444               0 :                         Phonize('S');
     445               0 :                         break;
     446                 :                         /* No transformation */
     447                 :                 case 'F':
     448                 :                 case 'J':
     449                 :                 case 'L':
     450                 :                 case 'M':
     451                 :                 case 'N':
     452                 :                 case 'R':
     453               0 :                         Phonize(Curr_Letter);
     454                 :                         break;
     455                 :                 default:
     456                 :                         /* nothing: remaining letters (the vowels) produce no phoneme here */
     457                 :                         break;
     458                 :                 }                                               /* END SWITCH */
     459                 : 
     460               0 :                 w_idx += skip_letter;   /* step over letters already consumed by a multi-letter rule */
     461                 :         }                                                       /* END FOR */
     462                 : 
     463               0 :         End_Phoned_Word;
     464                 : 
     465               0 :         return 0;
     466                 : }                                                               /* END metaphone */
     467                 : /* }}} */
     468                 : 
     469                 : /*
     470                 :  * Local variables:
     471                 :  * tab-width: 4
     472                 :  * c-basic-offset: 4
     473                 :  * End:
     474                 :  * vim600: sw=4 ts=4 fdm=marker
     475                 :  * vim<600: sw=4 ts=4
     476                 :  */
Generated by: LTP GCOV extension version 1.5