/* * GRESDX - grep soundex match routines * * Edits: * * 10-Oct-85 lmf Consolidate with multi-parameter match * 20-Oct-85 lmf Fix soundex algorithm * 25-Jan-86 lmf BOL/EOL match for SOUNDEX & speedup */ #define IDENT "v1.3.c" #include #include "grep.h" /* * Soundex match can be exact length or allow trailing consonants. * For example, 'john' would exact match: john, jon, jonn, and johann, * but not johnson, etc. * Define the following line to enable exact, restrictive matching * characteristic. */ /**** #define EXACT_SDX_MATCH ****/ #define SD_BOL (1) /* soundex match only at start */ #define SD_EOL (2) /* soundex match only at end */ extern int debug ; /* True for debug displays */ extern char *pat_beg ; /* Start of matched pattern */ extern char *pat_end ; /* and its end */ extern char lbuf[LMAX] ; /* Input line buffer */ int sdxflags[NPATS] ; /* special match flags: BOL, EOL */ long sdxhash[NPATS] ; /* soundex hash for patterns */ long sdxrange[NPATS] ; /* variance for short patterns */ long sdx_var ; /* variance returned by soundex() */ /*++ * SCOMPILE(pointer, index) * * Compile argument string into soundex hash and flags. Store resultant * data into soundex match vectors at index given in patno. */ scompile(p, patno) char *p ; int patno ; { long soundex() ; sdxflags[patno] = 0 ; if(*p == BOL_CH) { sdxflags[patno] |= SD_BOL ; /* flag start of line only */ p++ ; /* skip over BOL_CH */ } if(p[strlen(p)-1] == EOL_CH) { /* end of line match wanted? */ sdxflags[patno] |= SD_EOL ; /* indicate with flag word */ } sdxhash[patno] = soundex(p) ; /* collect word hash */ sdxrange[patno] = sdx_var ; /* and allowed range for short patterns */ if(debug) printf("SC: pn=%d flg=%d pat=%s\n", patno, sdxflags[patno], p ) ; } /*++ * SOUNDEX(string) * * Return the soundex hash value for the argument string. * S_V should be zero for efficiency. */ #define S_V 0 /* Vowels: a e i o u (maybe y, h, w) */ #define S_SK 1 /* "hard" consonants: s c z x k q */ #define S_TD 2 /* Dental stops: t d */ #define S_FV 3 /* f v */ #define S_GJ 4 /* g j */ #define S_B 5 /* b */ #define S_L 6 /* l */ #define S_M 7 /* m */ #define S_N 8 /* n */ #define S_P 9 /* p */ #define S_R 10 /* r */ #define S_MAX 11 /* "radix" of soundex values */ /* * The following are the hash values for each letter. */ static char sx_letters[] = { /* a b c d e f g h i j k l m */ S_V, S_B, S_SK, S_TD, S_V, S_FV, S_GJ, S_V, S_V, S_GJ, S_SK, S_L, S_M, /* n o p q r s t u v w x y z */ S_N, S_V, S_P, S_SK, S_R, S_SK, S_TD, S_V, S_FV, S_V, S_SK, S_V, S_SK }; /* * The first letter of the following consonant pairs is silent. */ static char *sx_leading_silent = "cztstzghgnknpnptpf"; /* * Compute the soundex hash for the argument string * -- somewhat modified from the original algorithm. * "Margaret K. Odell and Robert C. Russell. U.S. * patents 1261167 (1918) and 1435663 (1922)." as * reprinted in Donald Knuth, Sorting and Searching. */ long soundex(instring) char *instring ; { register int c; /* Current character */ register int next; /* Next character in the string */ register char *string ; /* working string pointer */ char *sxp; /* -> leading silent string */ int last; /* Previous character for doubles */ long value; /* Soundex value */ int temp; /* Hash for this character */ int count; /* Want only three digits */ int first ; /* First character flag */ #ifdef DEBUG if (debug) printf("SDX: %s :\n", string); #endif string = instring ; /* get register copy of pointer */ first = TRUE ; /* first character */ last = ' ' ; c = tolower(*string) ; string++ ; count = 5 ; /* Maximum consonants for this loop */ while (count && isalpha(c)) { /* build hash code */ next = tolower(*string) ; string++ ; if (next != EOS) { /* * Ignore the first letter in a silent-pair. */ for (sxp = sx_leading_silent; *sxp ; sxp++) { if (*sxp++ == c && *sxp == next) { c = next ; next = tolower(*string) ; string++ ; } } /* * Change 'ph' to 'f' */ if (c == 'p' && next == 'h') { c = 'f'; next = tolower(*string) ; string++ ; } /* * Check for consonant pairs */ if (c == next && sx_letters[c - 'a'] != S_V ) { next = tolower(*string) ; string++ ; } } if (first) { value = c - 'a' + 1 ; /* Initial hash == first letter value */ first = FALSE ; } else { if ((temp = sx_letters[c - 'a']) != S_V && c != last) { value *= S_MAX; value += temp; count-- ; } } #ifdef DEBUG if (debug) printf("%d: %c=%d %ld\n", count, c, temp, value) ; #endif last = c ; c = next ; } sdx_var = 1 ; while (count-- > 0) { /* Finish off the hash code */ value *= S_MAX ; /* Hash code */ sdx_var *= S_MAX ; /* Allowance for trailing chars */ } #ifdef DEBUG if (debug > 1) printf("SDX hash = %ld %ld\n", value, value+sdx_var); #endif return (value) ; } /*++ * SMATCH - Match the current line using soundex, return 1 if it does */ smatch(patno) int patno ; /* Argument number to match */ { register char *l; /* Line pointer */ char *ptbeg ; /* saved pattern pointer */ long sndl ; /* soundex hash code for word */ for (l = lbuf; *l; l++) { /* step through line */ while (!isalpha(*l) && *l != EOS) /* skip any leading trash */ l++ ; if((l != lbuf) && (sdxflags[patno] & SD_BOL)) /* at front of line? */ return(FALSE) ; if(*l == EOS) /* nothing worth checking */ return(FALSE) ; ptbeg = l ; /* start of match */ sndl = soundex(l) ; /* collect hash code */ while (isalpha(*l)) /* skip rest of word */ l++ ; #ifdef DEBUG if (debug) printf("%ld %ld %ld: %s\n", sdxhash[patno], sndl, sdxrange[patno], l) ; #endif #ifdef EXACT_SDX_MATCH if (sndl == sdxhash[patno]) { #else if (sdxhash[patno] <= sndl && sndl < sdxhash[patno] + sdxrange[patno]) { #endif if(*l != EOS && (sdxflags[patno] & SD_EOL)) /* not at end of line */ return(FALSE) ; pat_beg = ptbeg ; /* start of matched string */ pat_end = l ; /* end of match */ return(TRUE); /* yeah, got one */ } } return(FALSE); /* no good */ }