Soundex Matching

From Erlang Community

(Difference between revisions)
Revision as of 22:12, 18 August 2006 (edit)
Cyberlync (Talk | contribs)

← Previous diff
Revision as of 14:05, 1 September 2006 (edit) (undo)
213.171.204.166 (Talk)
(Solution)
Next diff →
Line 5: Line 5:
== Solution == == Solution ==
-Note: This library does not exist yet. Scheme data shown for the time being: +Use the soundex module below:
- +
-Use the soundex library: +
<code> <code>
-> (soundex "Smith")+> soundex:soundex("Smith").
"S530" "S530"
-> (soundex "Smyth")+> soundex:soundex("Smyth").
"S530" "S530"
</code> </code>
-Both current NARA Soundex and "old" Soundex are supported (soundex is an alias for soundex-nara): +Neither the NARA Soundex and "old" Soundex are supported (soundex is an alias for soundex-nara):
<code> <code>
> (soundex-nara "Ashcraft") > (soundex-nara "Ashcraft")
Line 23: Line 21:
</code> </code>
-Multiple Soundex keys based on prefix-skipping can be generated with the soundex-nara/prefixing, soundex-old/prefixing, and soundex/p procedures: +Nor can multiple Soundex keys based on prefix-skipping can be generated with the soundex-nara/prefixing, soundex-old/prefixing, and soundex/p procedures:
<code> <code>
Line 33: Line 31:
Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html. Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html.
-Soundex keys are represented as four-character strings, therefore the equal? procedure can be used to compare them: +== Implementation ==
<code> <code>
-> (equal? (soundex "Johnson") (soundex "Jackson"))+-module(soundex).
-#f+
-> (equal? (soundex "Johnson") (soundex "JANZEN"))+
-#t+
-</code>+
 +-export([soundex/1]).
 +soundex([]) -> [];
-This doesn't apply to Erlang, and is only here as a placeholder until the library is implemented. Coming to a Jungerl near you... +%%
 + 
 +soundex([First|T]) ->
 + pad([First|[num(X) || X <- strip(httpd_util:to_upper(T))]]).
 +
 +strip(String) -> strip(String, []).
 +strip([$A|T], Acc) -> strip(T, Acc);
 +strip([$E|T], Acc) -> strip(T, Acc);
 +strip([$H|T], Acc) -> strip(T, Acc);
 +strip([$I|T], Acc) -> strip(T, Acc);
 +strip([$O|T], Acc) -> strip(T, Acc);
 +strip([$U|T], Acc) -> strip(T, Acc);
 +strip([$W|T], Acc) -> strip(T, Acc);
 +strip([$Y|T], Acc) -> strip(T, Acc);
 +strip([32|T], Acc) -> strip(T, Acc);
 +strip([C|T], Acc) -> strip(T, [C|Acc]);
 +strip([], Acc) -> lists:reverse(Acc).
 + 
 +num($B) -> $1; num($F) -> $1; num($P) -> $1; num($V) -> $1;
 +num($C) -> $2; num($G) -> $2; num($J) -> $2; num($K) -> $2;
 +num($Q) -> $2; num($S) -> $2; num($X) -> $2; num($Z) -> $2;
 +num($D) -> $3; num($T) -> $3;
 +num($L) -> $4;
 +num($M) -> $5; num($N) -> $5;
 +num($R) -> $6.
 + 
 +dedup(String) -> dedup(String, []).
 +dedup([X,X|T], Acc) -> dedup([X|T], Acc);
 +dedup([C|T], Acc) -> dedup(T, [C|Acc]);
 +dedup([], Acc) -> lists:reverse(Acc).
 + 
 +pad([A,B,C,D]) -> [A,B,C,D];
 +pad([A,B,C]) -> [A,B,C,$0];
 +pad([A,B]) -> [A,B,$0,$0];
 +pad([A]) -> [A,$0,$0,$0];
 +pad([]) -> [$0,$0,$0,$0].
 +</code>
[[Category:CookBook]] [[Category:CookBook]]

Revision as of 14:05, 1 September 2006

Problem

You want to generate Soundex hashes of surnames, either for "sounds-like" indexing in databases or for retrieving information from US Census records and similar pre-existing databases.

Solution

Use the soundex module below:

> soundex:soundex("Smith").
"S530"
> soundex:soundex("Smyth").
"S530"

Neither the NARA Soundex nor the "old" Soundex is supported (soundex is an alias for soundex-nara):

> (soundex-nara "Ashcraft")
"A261"
> (soundex-old "Ashcraft")
"A226"

Nor can multiple Soundex keys based on prefix-skipping be generated with the soundex-nara/prefixing, soundex-old/prefixing, and soundex/p procedures:

> (soundex/p "vanderlinden")
("V536" "D645" "L535")


Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html.

Implementation

-module(soundex).

-export([soundex/1]).

%% @doc Compute the four-character Soundex key of `Name'.
%%
%% The key is the upper-cased first character of `Name' followed by three
%% digits. The empty string maps to the empty string.
%%
%% Coding rules implemented here:
%%   * the name is upper-cased first, so "smith" and "Smith" give the same key;
%%   * characters outside A-Z (spaces, hyphens, apostrophes, ...) are ignored
%%     when coding, because num/1 has no clause for them;
%%   * vowels (A, E, I, O, U, and Y, which is treated as a vowel here) separate
%%     the name into consonant groups; equal codes on either side of a vowel
%%     are both kept;
%%   * inside a group, strip/1 drops H and W, and dedup/1 then collapses
%%     adjacent equal codes, so H/W do not separate equal codes;
%%   * the first letter's own code is not repeated in the digits;
%%   * the digits are cut to three and pad/1 fills the key up with zeros.
-spec soundex(string()) -> string().
soundex([]) ->
    [];
soundex([_ | _] = Name) ->
    [Initial | _] = Upper = string:uppercase(Name),
    Letters = [C || C <- Upper, C >= $A, C =< $Z],
    %% Splitting on the vowels leaves runs of consonants (plus H and W).
    Groups = string:lexemes(Letters, "AEIOUY"),
    Digits = lists:append([dedup([num(C) || C <- strip(G)]) || G <- Groups]),
    pad([Initial | lists:sublist(drop_initial_code(Initial, Digits), 3)]).

%% The first character is emitted verbatim. When it is a coded consonant its
%% digit is necessarily the head of Digits (it opens the first group and
%% survives strip/1), so that digit is removed to avoid encoding it twice.
drop_initial_code(Initial, [_ | Rest] = Digits) ->
    case strip([Initial]) of
        [C] when C >= $A, C =< $Z -> Rest;
        _ -> Digits
    end;
drop_initial_code(_Initial, []) ->
    [].
	
%% @doc Remove every upper-case A, E, H, I, O, U, W, Y and every space
%% (character 32) from `String', keeping the remaining characters in their
%% original order. Lower-case letters are not touched.
strip(String) ->
    [Char || Char <- String, not lists:member(Char, "AEHIOUWY ")].

%% @doc Map an upper-case consonant to its Soundex digit character ($1..$6).
%% Any other input has no matching clause and raises `function_clause'.
num(L) when L =:= $B; L =:= $F; L =:= $P; L =:= $V ->
    $1;
num(L) when L =:= $C; L =:= $G; L =:= $J; L =:= $K;
            L =:= $Q; L =:= $S; L =:= $X; L =:= $Z ->
    $2;
num(L) when L =:= $D; L =:= $T ->
    $3;
num($L) ->
    $4;
num(L) when L =:= $M; L =:= $N ->
    $5;
num($R) ->
    $6.

%% @doc Collapse every run of adjacent, identical elements of the list into a
%% single element. Elements that are equal but not adjacent are all kept.
dedup([Same | [Same | _] = Rest]) ->
    %% Two equal neighbours: skip the first and keep examining the run.
    dedup(Rest);
dedup([Head | Rest]) ->
    [Head | dedup(Rest)];
dedup([]) ->
    [].

%% @doc Normalise a Soundex key to exactly four characters.
%%
%% Shorter lists are right-padded with the character $0 (the empty list
%% becomes "0000"); longer lists are cut after the fourth element instead of
%% failing with `function_clause', which is what happened for any name that
%% produced more than three digits.
pad(Key) when is_list(Key) ->
    %% Appending four zeros guarantees at least four elements to take from.
    lists:sublist(Key ++ "0000", 4).
Erlang/OTP Projects
Personal tools