From ef0f5379aae78ce5ed92be8dd4d9792b01fc6879 Mon Sep 17 00:00:00 2001 From: Steve Peters Date: Wed, 22 Aug 2007 15:15:10 +0000 Subject: [PATCH] Upgrade to Text-Soundex-3.03 p4raw-id: //depot/perl@31749 --- ext/Text/Soundex/Soundex.pm | 73 ++++++++++++++++++--------------- ext/Text/Soundex/Soundex.xs | 98 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 118 insertions(+), 53 deletions(-) diff --git a/ext/Text/Soundex/Soundex.pm b/ext/Text/Soundex/Soundex.pm index 07630d7..5d70220 100644 --- a/ext/Text/Soundex/Soundex.pm +++ b/ext/Text/Soundex/Soundex.pm @@ -1,6 +1,6 @@ # -*- perl -*- -# (c) Copyright 1998-2003 by Mark Mielke +# (c) Copyright 1998-2007 by Mark Mielke # # Freedom to use these sources for whatever you want, as long as credit # is given where credit is due, is hereby granted. You may make modifications @@ -19,10 +19,10 @@ use XSLoader (); use strict; -our $VERSION = '3.02'; +our $VERSION = '3.03'; our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode $soundex_nocode); -our @EXPORT = qw(soundex $soundex_nocode); +our @EXPORT = qw(soundex soundex_nara $soundex_nocode); our @ISA = qw(Exporter); our $nocode; @@ -34,10 +34,10 @@ our $nocode; sub soundex_noxs { - # Strict implementation of Knuth's soundex algorithm. + # Original Soundex algorithm my @results = map { - my $code = $_; + my $code = uc($_); $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; if (length($code)) { @@ -56,11 +56,7 @@ sub soundex_noxs sub soundex_nara { - # Implementation of NARA's soundex algorithm. If two sounds are - # identical, and separated by only an H or a W... they should be - # treated as one. This requires an additional "s///", as well as - # the "9" character code to represent H and W. ("9" works like "0" - # except it combines indentical sounds around it into one) + # US census (NARA) algorithm. my @results = map { my $code = uc($_); @@ -70,7 +66,7 @@ sub soundex_nara my $firstchar = substr($code, 0, 1); $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] [0000990000009900111111112222222222222222333344555566]s; - $code =~ s/(.)9\1/$1/g; + $code =~ s/(.)9\1/$1/gs; ($code = substr($code, 1)) =~ tr/09//d; substr($firstchar . $code . '000', 0, 4); } else { @@ -110,12 +106,11 @@ if (defined(&soundex_xs)) { __END__ -# Implementation of soundex algorithm as described by Knuth in volume -# 3 of The Art of Computer Programming. +# Implementation of the soundex algorithm. # # Some of this documention was written by Mike Stok. # -# Knuth's test cases are: +# Examples: # # Euler, Ellery -> E460 # Gauss, Ghosh -> G200 @@ -127,28 +122,44 @@ __END__ =head1 NAME -Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth +Text::Soundex - Implementation of the soundex algorithm. =head1 SYNOPSIS - use Text::Soundex 'soundex'; + use Text::Soundex; + # Original algorithm. $code = soundex($name); # Get the soundex code for a name. @codes = soundex(@names); # Get the list of codes for a list of names. + # American Soundex variant (NARA) - Used for US census data. + $code = soundex_nara($name); # Get the soundex code for a name. + @codes = soundex_nara(@names); # Get the list of codes for a list of names. + # Redefine the value that soundex() will return if the input string # contains no identifiable sounds within it. $Text::Soundex::nocode = 'Z000'; =head1 DESCRIPTION -This module implements the soundex algorithm as described by Donald Knuth -in Volume 3 of B. The algorithm is -intended to hash words (in particular surnames) into a small space -using a simple model which approximates the sound of the word when -spoken by an English speaker. Each word is reduced to a four -character string, the first character being an upper case letter and -the remaining three being digits. +Soundex is a phonetic algorithm for indexing names by sound, as +pronounced in English. The goal is for names with the same +pronunciation to be encoded to the same representation so that they +can be matched despite minor differences in spelling. Soundex is the +most widely known of all phonetic algorithms and is often used +(incorrectly) as a synonym for "phonetic algorithm". Improvements to +Soundex are the basis for many modern phonetic algorithms. (Wikipedia, +2007) + +This module implements the original soundex algorithm developed by +Robert Russell and Margaret Odell, patented in 1918 and 1922, as well +as a variation called "American Soundex" used for US census data, and +current maintained by the National Archives and Records Administration +(NARA). + +The soundex algorithm may be recognized from Donald Knuth's +B. The algorithm described by +Knuth is the NARA algorithm. The value returned for strings which have no soundex encoding is defined using C<$Text::Soundex::nocode>. The default value is C, @@ -163,26 +174,24 @@ argument. In list context, a list is returned in which each element is the soundex code for the corresponding argument passed to C. For example, the following code assigns @codes the value C<('M200', 'S320')>: - @codes = soundex qw(Mike Stok); + @codes = soundex qw(Mike Stok); To use C to generate codes that can be used to search one -of the publically available US Censuses, a variant of the soundex() -subroutine must be used: +of the publically available US Censuses, a variant of the soundex +algorithm must be used: - use Text::Soundex 'soundex_nara'; + use Text::Soundex; $code = soundex_nara($name); -The algorithm used by the US Censuses is slightly different than that -defined by Knuth and others. The descrepancy shows up in names such as -"Ashcraft": +An example of where these algorithm differ follows: - use Text::Soundex qw(soundex soundex_nara); + use Text::Soundex; print soundex("Ashcraft"), "\n"; # prints: A226 print soundex_nara("Ashcraft"), "\n"; # prints: A261 =head1 EXAMPLES -Knuth's examples of various names and the soundex codes they map to +Donald Knuth's examples of names and the soundex codes they map to are listed below: Euler, Ellery -> E460 diff --git a/ext/Text/Soundex/Soundex.xs b/ext/Text/Soundex/Soundex.xs index 6ca8d34..1496338 100644 --- a/ext/Text/Soundex/Soundex.xs +++ b/ext/Text/Soundex/Soundex.xs @@ -21,9 +21,63 @@ # define utf8n_to_uvchr utf8_to_uv #endif -static char *soundex_table = - /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/ - "01230120022455012623010202"; +static char sv_soundex_table[0x100]; +static void sv_soundex_initialize (void) +{ + memset(&sv_soundex_table[0], '\0', sizeof(sv_soundex_table)); + sv_soundex_table['A'] = '0'; + sv_soundex_table['a'] = '0'; + sv_soundex_table['E'] = '0'; + sv_soundex_table['e'] = '0'; + sv_soundex_table['H'] = '0'; + sv_soundex_table['h'] = '0'; + sv_soundex_table['I'] = '0'; + sv_soundex_table['i'] = '0'; + sv_soundex_table['O'] = '0'; + sv_soundex_table['o'] = '0'; + sv_soundex_table['U'] = '0'; + sv_soundex_table['u'] = '0'; + sv_soundex_table['W'] = '0'; + sv_soundex_table['w'] = '0'; + sv_soundex_table['Y'] = '0'; + sv_soundex_table['y'] = '0'; + sv_soundex_table['B'] = '1'; + sv_soundex_table['b'] = '1'; + sv_soundex_table['F'] = '1'; + sv_soundex_table['f'] = '1'; + sv_soundex_table['P'] = '1'; + sv_soundex_table['p'] = '1'; + sv_soundex_table['V'] = '1'; + sv_soundex_table['v'] = '1'; + sv_soundex_table['C'] = '2'; + sv_soundex_table['c'] = '2'; + sv_soundex_table['G'] = '2'; + sv_soundex_table['g'] = '2'; + sv_soundex_table['J'] = '2'; + sv_soundex_table['j'] = '2'; + sv_soundex_table['K'] = '2'; + sv_soundex_table['k'] = '2'; + sv_soundex_table['Q'] = '2'; + sv_soundex_table['q'] = '2'; + sv_soundex_table['S'] = '2'; + sv_soundex_table['s'] = '2'; + sv_soundex_table['X'] = '2'; + sv_soundex_table['x'] = '2'; + sv_soundex_table['Z'] = '2'; + sv_soundex_table['z'] = '2'; + sv_soundex_table['D'] = '3'; + sv_soundex_table['d'] = '3'; + sv_soundex_table['T'] = '3'; + sv_soundex_table['t'] = '3'; + sv_soundex_table['L'] = '4'; + sv_soundex_table['l'] = '4'; + sv_soundex_table['M'] = '5'; + sv_soundex_table['m'] = '5'; + sv_soundex_table['N'] = '5'; + sv_soundex_table['n'] = '5'; + sv_soundex_table['R'] = '6'; + sv_soundex_table['r'] = '6'; +} static SV *sv_soundex (SV *source) { @@ -38,28 +92,27 @@ static SV *sv_soundex (SV *source) while (source_p != source_end) { - if ((*source_p & ~((UV) 0x7F)) == 0 && isalpha(*source_p)) + char codepart_last = sv_soundex_table[(unsigned char) *source_p]; + + if (codepart_last != '\0') { SV *code = newSV(SOUNDEX_ACCURACY); char *code_p = SvPVX(code); char *code_end = &code_p[SOUNDEX_ACCURACY]; - char code_last; SvCUR_set(code, SOUNDEX_ACCURACY); SvPOK_only(code); - code_last = soundex_table[(*code_p++ = toupper(*source_p++)) - 'A']; + *code_p++ = toupper(*source_p++); while (source_p != source_end && code_p != code_end) { char c = *source_p++; + char codepart = sv_soundex_table[(unsigned char) c]; - if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) - { - *code_p = soundex_table[toupper(c) - 'A']; - if (*code_p != code_last && (code_last = *code_p) != '0') - code_p++; - } + if (codepart != '\0') + if (codepart != codepart_last && (codepart_last = codepart) != '0') + *code_p++ = codepart; } while (code_p != code_end) @@ -91,31 +144,30 @@ static SV *sv_soundex_utf8 (SV* source) { STRLEN offset; UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; source_p = (offset >= 1) ? &source_p[offset] : source_end; - if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + if (codepart_last != '\0') { SV *code = newSV(SOUNDEX_ACCURACY); char *code_p = SvPVX(code); char *code_end = &code_p[SOUNDEX_ACCURACY]; - char code_last; SvCUR_set(code, SOUNDEX_ACCURACY); SvPOK_only(code); - code_last = soundex_table[(*code_p++ = toupper(c)) - 'A']; + *code_p++ = toupper(c); while (source_p != source_end && code_p != code_end) { + char codepart; c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; source_p = (offset >= 1) ? &source_p[offset] : source_end; - if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) - { - *code_p = soundex_table[toupper(c) - 'A']; - if (*code_p != code_last && (code_last = *code_p) != '0') - code_p++; - } + if (codepart != '\0') + if (codepart != codepart_last && (codepart_last = codepart) != '0') + *code_p++ = codepart; } while (code_p != code_end) @@ -138,6 +190,10 @@ PROTOTYPES: DISABLE void soundex_xs (...) +INIT: +{ + sv_soundex_initialize(); +} PPCODE: { int i; -- 2.7.4