From b9eae89ed3bfcec528cd35084af5994e90526568 Mon Sep 17 00:00:00 2001 From: Steve Peters Date: Wed, 11 Oct 2006 00:52:58 +0000 Subject: [PATCH] Grrr...moved the files, but forgot to update Soundex.pm p4raw-id: //depot/perl@28991 --- ext/Text/Soundex/Soundex.pm | 289 ++++++++++++++++++++++++++++++-------------- 1 file changed, 195 insertions(+), 94 deletions(-) diff --git a/ext/Text/Soundex/Soundex.pm b/ext/Text/Soundex/Soundex.pm index 64a9e65..07630d7 100644 --- a/ext/Text/Soundex/Soundex.pm +++ b/ext/Text/Soundex/Soundex.pm @@ -1,22 +1,122 @@ +# -*- perl -*- + +# (c) Copyright 1998-2003 by Mark Mielke +# +# Freedom to use these sources for whatever you want, as long as credit +# is given where credit is due, is hereby granted. You may make modifications +# where you see fit but leave this copyright somewhere visible. As well, try +# to initial any changes you make so that if I like the changes I can +# incorporate them into later versions. +# +# - Mark Mielke +# + package Text::Soundex; -require 5.000; -require Exporter; +require 5.006; -@ISA = qw(Exporter); -@EXPORT = qw(&soundex $soundex_nocode); +use Exporter (); +use XSLoader (); -$VERSION = '1.01'; +use strict; + +our $VERSION = '3.02'; +our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode + $soundex_nocode); +our @EXPORT = qw(soundex $soundex_nocode); +our @ISA = qw(Exporter); + +our $nocode; + +# Previous releases of Text::Soundex made $nocode available as $soundex_nocode. +# For now, this part of the interface is exported and maintained. +# In the feature, $soundex_nocode will be deprecated. +*Text::Soundex::soundex_nocode = \$nocode; + +sub soundex_noxs +{ + # Strict implementation of Knuth's soundex algorithm. + + my @results = map { + my $code = $_; + $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; + + if (length($code)) { + my $firstchar = substr($code, 0, 1); + $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] + [0000000000000000111111112222222222222222333344555566]s; + ($code = substr($code, 1)) =~ tr/0//d; + substr($firstchar . $code . '000', 0, 4); + } else { + $nocode; + } + } @_; + + wantarray ? @results : $results[0]; +} + +sub soundex_nara +{ + # Implementation of NARA's soundex algorithm. If two sounds are + # identical, and separated by only an H or a W... they should be + # treated as one. This requires an additional "s///", as well as + # the "9" character code to represent H and W. ("9" works like "0" + # except it combines indentical sounds around it into one) + + my @results = map { + my $code = uc($_); + $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; + + if (length($code)) { + my $firstchar = substr($code, 0, 1); + $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] + [0000990000009900111111112222222222222222333344555566]s; + $code =~ s/(.)9\1/$1/g; + ($code = substr($code, 1)) =~ tr/09//d; + substr($firstchar . $code . '000', 0, 4); + } else { + $nocode + } + } @_; + + wantarray ? @results : $results[0]; +} + +sub soundex_unicode +{ + require Text::Unidecode unless defined &Text::Unidecode::unidecode; + soundex(Text::Unidecode::unidecode(@_)); +} + +sub soundex_nara_unicode +{ + require Text::Unidecode unless defined &Text::Unidecode::unidecode; + soundex_nara(Text::Unidecode::unidecode(@_)); +} + +eval { XSLoader::load(__PACKAGE__, $VERSION) }; + +if (defined(&soundex_xs)) { + *soundex = \&soundex_xs; +} else { + *soundex = \&soundex_noxs; + *soundex_xs = sub { + require Carp; + Carp::croak("XS implementation of Text::Soundex::soundex_xs() ". + "could not be loaded"); + }; +} + +1; + +__END__ -# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ -# # Implementation of soundex algorithm as described by Knuth in volume -# 3 of The Art of Computer Programming, with ideas stolen from Ian -# Phillipps . +# 3 of The Art of Computer Programming. # -# Mike Stok , 2 March 1994. +# Some of this documention was written by Mike Stok. # # Knuth's test cases are: -# +# # Euler, Ellery -> E460 # Gauss, Ghosh -> G200 # Hilbert, Heilbronn -> H416 @@ -24,58 +124,6 @@ $VERSION = '1.01'; # Lloyd, Ladd -> L300 # Lukasiewicz, Lissajous -> L222 # -# $Log: soundex.pl,v $ -# Revision 1.2 1994/03/24 00:30:27 mike -# Subtle bug (any excuse :-) spotted by Rich Pinder -# in the way I handles leasing characters which were different but had -# the same soundex code. This showed up comparing it with Oracle's -# soundex output. -# -# Revision 1.1 1994/03/02 13:01:30 mike -# Initial revision -# -# -############################################################################## - -# $soundex_nocode is used to indicate a string doesn't have a soundex -# code, I like undef other people may want to set it to 'Z000'. - -$soundex_nocode = undef; - -sub soundex -{ - local (@s, $f, $fc, $_) = @_; - - push @s, '' unless @s; # handle no args as a single empty string - - foreach (@s) - { - $_ = uc $_; - tr/A-Z//cd; - - if ($_ eq '') - { - $_ = $soundex_nocode; - } - else - { - ($f) = /^(.)/; - tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; - ($fc) = /^(.)/; - s/^$fc+//; - tr///cs; - tr/0//d; - $_ = $f . $_ . '000'; - s/^(.{4}).*/$1/; - } - } - - wantarray ? @s : shift @s; -} - -1; - -__END__ =head1 NAME @@ -83,37 +131,54 @@ Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth =head1 SYNOPSIS - use Text::Soundex; + use Text::Soundex 'soundex'; - $code = soundex $string; # get soundex code for a string - @codes = soundex @list; # get list of codes for list of strings + $code = soundex($name); # Get the soundex code for a name. + @codes = soundex(@names); # Get the list of codes for a list of names. - # set value to be returned for strings without soundex code - - $soundex_nocode = 'Z000'; + # Redefine the value that soundex() will return if the input string + # contains no identifiable sounds within it. + $Text::Soundex::nocode = 'Z000'; =head1 DESCRIPTION This module implements the soundex algorithm as described by Donald Knuth in Volume 3 of B. The algorithm is -intended to hash words (in particular surnames) into a small space using a -simple model which approximates the sound of the word when spoken by an English -speaker. Each word is reduced to a four character string, the first -character being an upper case letter and the remaining three being digits. +intended to hash words (in particular surnames) into a small space +using a simple model which approximates the sound of the word when +spoken by an English speaker. Each word is reduced to a four +character string, the first character being an upper case letter and +the remaining three being digits. + +The value returned for strings which have no soundex encoding is +defined using C<$Text::Soundex::nocode>. The default value is C, +however values such as C<'Z000'> are commonly used alternatives. -If there is no soundex code representation for a string then the value of -C<$soundex_nocode> is returned. This is initially set to C, but -many people seem to prefer an I value like C -(how unlikely this is depends on the data set being dealt with.) Any value -can be assigned to C<$soundex_nocode>. +For backward compatibility with older versions of this module the +C<$Text::Soundex::nocode> is exported into the caller's namespace as +C<$soundex_nocode>. -In scalar context C returns the soundex code of its first -argument, and in list context a list is returned in which each element is the -soundex code for the corresponding argument passed to C e.g. +In scalar context, C returns the soundex code of its first +argument. In list context, a list is returned in which each element is the +soundex code for the corresponding argument passed to C. For +example, the following code assigns @codes the value C<('M200', 'S320')>: @codes = soundex qw(Mike Stok); -leaves C<@codes> containing C<('M200', 'S320')>. +To use C to generate codes that can be used to search one +of the publically available US Censuses, a variant of the soundex() +subroutine must be used: + + use Text::Soundex 'soundex_nara'; + $code = soundex_nara($name); + +The algorithm used by the US Censuses is slightly different than that +defined by Knuth and others. The descrepancy shows up in names such as +"Ashcraft": + + use Text::Soundex qw(soundex soundex_nara); + print soundex("Ashcraft"), "\n"; # prints: A226 + print soundex_nara("Ashcraft"), "\n"; # prints: A261 =head1 EXAMPLES @@ -129,22 +194,58 @@ are listed below: so: - $code = soundex 'Knuth'; # $code contains 'K530' - @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' + $code = soundex 'Knuth'; # $code contains 'K530' + @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' =head1 LIMITATIONS As the soundex algorithm was originally used a B time ago in the US -it considers only the English alphabet and pronunciation. +it considers only the English alphabet and pronunciation. In particular, +non-ASCII characters will be ignored. The recommended method of dealing +with characters that have accents, or other unicode characters, is to use +the Text::Unidecode module available from CPAN. Either use the module +explicitly: + + use Text::Soundex; + use Text::Unidecode; + + print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n" + +Or use the convenient wrapper routine: + + use Text::Soundex 'soundex_unicode'; + + print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n" + +Since the soundex algorithm maps a large space (strings of arbitrary +length) onto a small space (single letter plus 3 digits) no inference +can be made about the similarity of two strings which end up with the +same soundex code. For example, both C and C end +up with a soundex code of C. + +=head1 MAINTAINER + +This module is currently maintain by Mark Mielke (C). + +=head1 HISTORY + +Version 3 is a significant update to provide support for versions of +Perl later than Perl 5.004. Specifically, the XS version of the +soundex() subroutine understands strings that are encoded using UTF-8 +(unicode strings). + +Version 2 of this module was a re-write by Mark Mielke (C) +to improve the speed of the subroutines. The XS version of the soundex() +subroutine was introduced in 2.00. + +Version 1 of this module was written by Mike Stok (C) +and was included into the Perl core library set. -As it is mapping a large space (arbitrary length strings) onto a small -space (single letter plus 3 digits) no inference can be made about the -similarity of two strings which end up with the same soundex code. For -example, both C and C end up with a soundex code -of C. +Dave Carlsen (C) made the request for the NARA +algorithm to be included. The NARA soundex page can be viewed at: +C -=head1 AUTHOR +Ian Phillips (C) and Rich Pinder (C) +supplied ideas and spotted mistakes for v1.x. -This code was implemented by Mike Stok (C) from the -description given by Knuth. Ian Phillipps (C) and Rich Pinder -(C) supplied ideas and spotted mistakes. +=cut -- 2.7.4