Soundex Matching
From Erlang Community
| Revision as of 22:12, 18 August 2006 (edit) Cyberlync (Talk | contribs) ← Previous diff |
Revision as of 14:05, 1 September 2006 (edit) (undo) 213.171.204.166 (Talk) (→Solution) Next diff → |
||
| Line 5: | Line 5: | ||
| == Solution == | == Solution == | ||
| - | + | Use the soundex module below: | |
| - | + | ||
| - | Use the soundex | + | |
| <code> | <code> | ||
| - | > | + | > soundex:soundex("Smith"). |
| "S530" | "S530" | ||
| - | > | + | > soundex:soundex("Smyth"). |
| "S530" | "S530" | ||
| </code> | </code> | ||
| - | + | Neither the NARA Soundex nor the "old" Soundex is supported (soundex is an alias for soundex-nara): | |
| <code> | <code> | ||
| > (soundex-nara "Ashcraft") | > (soundex-nara "Ashcraft") | ||
| Line 23: | Line 21: | ||
| </code> | </code> | ||
| - | + | Nor can multiple Soundex keys based on prefix-skipping be generated with the soundex-nara/prefixing, soundex-old/prefixing, and soundex/p procedures: | |
| <code> | <code> | ||
| Line 33: | Line 31: | ||
| Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html. | Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html. | ||
| - | + | == Implementation == | |
| <code> | <code> | ||
| - | + | -module(soundex). | |
| - | + | ||
| - | + | ||
| - | + | ||
| - | + | ||
| + | -export([soundex/1]). | ||
| + | soundex([]) -> []; | ||
| - | + | %% | |
| + | |||
| + | soundex([First|T]) -> | ||
| + | pad([First|[num(X) || X <- strip(httpd_util:to_upper(T))]]). | ||
| + | |||
| + | strip(String) -> strip(String, []). | ||
| + | strip([$A|T], Acc) -> strip(T, Acc); | ||
| + | strip([$E|T], Acc) -> strip(T, Acc); | ||
| + | strip([$H|T], Acc) -> strip(T, Acc); | ||
| + | strip([$I|T], Acc) -> strip(T, Acc); | ||
| + | strip([$O|T], Acc) -> strip(T, Acc); | ||
| + | strip([$U|T], Acc) -> strip(T, Acc); | ||
| + | strip([$W|T], Acc) -> strip(T, Acc); | ||
| + | strip([$Y|T], Acc) -> strip(T, Acc); | ||
| + | strip([32|T], Acc) -> strip(T, Acc); | ||
| + | strip([C|T], Acc) -> strip(T, [C|Acc]); | ||
| + | strip([], Acc) -> lists:reverse(Acc). | ||
| + | |||
| + | num($B) -> $1; num($F) -> $1; num($P) -> $1; num($V) -> $1; | ||
| + | num($C) -> $2; num($G) -> $2; num($J) -> $2; num($K) -> $2; | ||
| + | num($Q) -> $2; num($S) -> $2; num($X) -> $2; num($Z) -> $2; | ||
| + | num($D) -> $3; num($T) -> $3; | ||
| + | num($L) -> $4; | ||
| + | num($M) -> $5; num($N) -> $5; | ||
| + | num($R) -> $6. | ||
| + | |||
| + | dedup(String) -> dedup(String, []). | ||
| + | dedup([X,X|T], Acc) -> dedup([X|T], Acc); | ||
| + | dedup([C|T], Acc) -> dedup(T, [C|Acc]); | ||
| + | dedup([], Acc) -> lists:reverse(Acc). | ||
| + | |||
| + | pad([A,B,C,D]) -> [A,B,C,D]; | ||
| + | pad([A,B,C]) -> [A,B,C,$0]; | ||
| + | pad([A,B]) -> [A,B,$0,$0]; | ||
| + | pad([A]) -> [A,$0,$0,$0]; | ||
| + | pad([]) -> [$0,$0,$0,$0]. | ||
| + | </code> | ||
| [[Category:CookBook]] | [[Category:CookBook]] | ||
Revision as of 14:05, 1 September 2006
Problem
You want to generate Soundex hashes of surnames, either for "sounds-like" indexing of databases or for retrieving information from US Census records and similar pre-existing databases.
Solution
Use the soundex module below:
> soundex:soundex("Smith").
"S530"
> soundex:soundex("Smyth").
"S530"
|
Neither the NARA Soundex nor the "old" Soundex is supported (soundex is an alias for soundex-nara):
> (soundex-nara "Ashcraft") "A261" > (soundex-old "Ashcraft") "A226" |
Nor can multiple Soundex keys based on prefix-skipping be generated with the soundex-nara/prefixing, soundex-old/prefixing, and soundex/p procedures:
> (soundex/p "vanderlinden")
("V536" "D645" "L535")
|
Soundex is a string hash historically used by the US Census for indexing surnames by a function of what they "sound" like, rather than their precise spelling. Further general information on Soundex is available at http://www.archives.gov/research_room/genealogy/census/soundex.html.
Implementation
-module(soundex). -export([soundex/1]). soundex([]) -> []; %% soundex([First|T]) -> pad([First|[num(X) || X <- strip(httpd_util:to_upper(T))]]). strip(String) -> strip(String, []). strip([$A|T], Acc) -> strip(T, Acc); strip([$E|T], Acc) -> strip(T, Acc); strip([$H|T], Acc) -> strip(T, Acc); strip([$I|T], Acc) -> strip(T, Acc); strip([$O|T], Acc) -> strip(T, Acc); strip([$U|T], Acc) -> strip(T, Acc); strip([$W|T], Acc) -> strip(T, Acc); strip([$Y|T], Acc) -> strip(T, Acc); strip([32|T], Acc) -> strip(T, Acc); strip([C|T], Acc) -> strip(T, [C|Acc]); strip([], Acc) -> lists:reverse(Acc). num($B) -> $1; num($F) -> $1; num($P) -> $1; num($V) -> $1; num($C) -> $2; num($G) -> $2; num($J) -> $2; num($K) -> $2; num($Q) -> $2; num($S) -> $2; num($X) -> $2; num($Z) -> $2; num($D) -> $3; num($T) -> $3; num($L) -> $4; num($M) -> $5; num($N) -> $5; num($R) -> $6. dedup(String) -> dedup(String, []). dedup([X,X|T], Acc) -> dedup([X|T], Acc); dedup([C|T], Acc) -> dedup(T, [C|Acc]); dedup([], Acc) -> lists:reverse(Acc). pad([A,B,C,D]) -> [A,B,C,D]; pad([A,B,C]) -> [A,B,C,$0]; pad([A,B]) -> [A,B,$0,$0]; pad([A]) -> [A,$0,$0,$0]; pad([]) -> [$0,$0,$0,$0]. |

Digg It
Del.icio.us
Reddit
Facebook
Stumble Upon
Technorati

