From 82d4656df665201b9ba032928e064ce3d0b00788 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 24 Oct 2024 15:34:06 -0500 Subject: [PATCH] cmd/server: filter results based on phonetic similarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This greatly reduces the number of jaro-winkler comparisons performed. │ before.txt │ after2.txt │ │ sec/op │ sec/op vs base │ JaroWinkler/bestPairsJaroWinkler-16 4.535µ ± ∞ ¹ 3.161µ ± ∞ ¹ ~ (p=1.000 n=1) ² │ before.txt │ after2.txt │ │ B/op │ B/op vs base │ JaroWinkler/bestPairsJaroWinkler-16 862.0 ± ∞ ¹ 359.0 ± ∞ ¹ ~ (p=1.000 n=1) ² │ before.txt │ after2.txt │ │ allocs/op │ allocs/op vs base │ JaroWinkler/bestPairsJaroWinkler-16 32.00 ± ∞ ¹ 15.00 ± ∞ ¹ ~ (p=1.000 n=1) ² --- CHANGELOG.md | 9 +++++ README.md | 1 + cmd/server/issue115_test.go | 2 +- cmd/server/new_algorithm_test.go | 4 +-- cmd/server/phonetics.go | 39 ++++++++++++++++++++ cmd/server/phonetics_test.go | 35 ++++++++++++++++++ cmd/server/search.go | 16 +++++++-- cmd/server/search_handlers_bench_test.go | 16 --------- cmd/server/search_handlers_test.go | 29 ++++++--------- cmd/server/search_test.go | 46 ++++++++++++------------ docs/usage-configuration.md | 15 ++++++-- 11 files changed, 147 insertions(+), 65 deletions(-) create mode 100644 cmd/server/phonetics.go create mode 100644 cmd/server/phonetics_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index d00a7d0b..8fd2dc79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## v0.30.0 (Released 2024-10-24) + +ADDITIONS + +Watchman now filters out indexed records based on the first character's phonetic match. This helps eliminate most +low-scoring results and reduces CPU usage. + +You can force scoring search terms against every indexed record by setting `DISABLE_PHONETIC_FILTERING=yes`. 
+ ## v0.29.2 (Released 2024-10-23) IMPROVEMENTS diff --git a/README.md b/README.md index d0588f5b..2f2bd1dd 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,7 @@ PONG | `SEARCH_MAX_WORKERS` | Maximum number of goroutines used for search. | 1024 | | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | +| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` | | `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before they score is penalised. | 0.9 | | `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 | | `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 | diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go index fb53e158..a3c1533d 100644 --- a/cmd/server/issue115_test.go +++ b/cmd/server/issue115_test.go @@ -35,7 +35,7 @@ func TestIssue115__TopSDNs(t *testing.T) { s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe) out = s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 18996", out[0].match, 0.650) + eql(t, "issue115: top SDN 18996", out[0].match, 0.686) // another example s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe) diff --git a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go index 30067228..2a24cc68 100644 --- a/cmd/server/new_algorithm_test.go +++ b/cmd/server/new_algorithm_test.go @@ -15,7 +15,7 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { // 1. 
SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich oldScore, newScore := compareAlgorithms("vladimirov vladimir vladimirovich", "vladimir levenshtein") eql(t, "Score is too high", oldScore, 0.961) - eql(t, "New score is better", newScore, 0.603) + eql(t, "New score is better", newScore, 0.527) // 2. SDN Entity 7788 "SHAQIRI, Shaqir" oldScore, newScore = compareAlgorithms("shaqiri shaqir", "zaid shakir") @@ -42,7 +42,7 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { // Words that start with different letters shouldn't match very highly oldScore, newScore = compareAlgorithms("dominguez", "jimenez") eql(t, "Score is too high", oldScore, 0.690) - eql(t, "New score is better", newScore, 0.580) + eql(t, "New score is better", newScore, 0.0) } func TestBestPairsJaroWinkler__TruePositives(t *testing.T) { diff --git a/cmd/server/phonetics.go b/cmd/server/phonetics.go new file mode 100644 index 00000000..18107a33 --- /dev/null +++ b/cmd/server/phonetics.go @@ -0,0 +1,39 @@ +package main + +import ( + "unicode" +) + +var soundexMap = map[rune]rune{ + 'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A', 'Y': 'A', // vowels + 'B': 'B', 'F': 'B', 'P': 'B', 'V': 'B', // similar sounds + 'C': 'C', 'G': 'C', 'J': 'C', 'K': 'C', 'Q': 'C', 'S': 'C', 'X': 'C', 'Z': 'C', // sibilants + 'D': 'D', 'T': 'D', // dental sounds + 'L': 'L', // liquids + 'M': 'M', 'N': 'M', // nasal sounds + 'R': 'R', // trills + 'H': 'H', 'W': 'H', // breathy sounds +} + +// getPhoneticClass returns the phonetic class of the first letter in a string +func getPhoneticClass(s string) rune { + if s == "" { + return ' ' + } + // Return the first rune mapped with partial soundex + for _, r := range s { + firstLetter := unicode.ToUpper(r) + if phonetic, ok := soundexMap[firstLetter]; ok { + return phonetic + } + return firstLetter + } + return ' ' +} + +func firstCharacterSoundexMatch(s1, s2 string) bool { + if s1 == "" || s2 == "" { + return false + } + return getPhoneticClass(s1) == 
getPhoneticClass(s2) +} diff --git a/cmd/server/phonetics_test.go b/cmd/server/phonetics_test.go new file mode 100644 index 00000000..e25ef4b0 --- /dev/null +++ b/cmd/server/phonetics_test.go @@ -0,0 +1,35 @@ +package main + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestFirstCharacterSoundexMatch(t *testing.T) { + require.True(t, firstCharacterSoundexMatch("a", "A")) + require.True(t, firstCharacterSoundexMatch("Catherine", "Katherine")) + require.True(t, firstCharacterSoundexMatch("Fone", "Phone")) + require.True(t, firstCharacterSoundexMatch("Vibe", "Bribe")) + require.True(t, firstCharacterSoundexMatch("mine", "nine")) + + require.False(t, firstCharacterSoundexMatch("a", "")) + require.False(t, firstCharacterSoundexMatch("", "A")) + require.False(t, firstCharacterSoundexMatch("Dave", "Eve")) +} + +func TestDisablePhoneticFiltering(t *testing.T) { + search := strings.Fields("ian mckinley") + indexed := "tian xiang 7" + + t.Setenv("DISABLE_PHONETIC_FILTERING", "no") + score := bestPairsJaroWinkler(search, indexed) + require.InDelta(t, 0.00, score, 0.01) + + // Disable filtering (force the compare) + t.Setenv("DISABLE_PHONETIC_FILTERING", "yes") + + score = bestPairsJaroWinkler(search, indexed) + require.InDelta(t, 0.544, score, 0.01) +} diff --git a/cmd/server/search.go b/cmd/server/search.go index 4b61ec6b..005cb838 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -18,6 +18,7 @@ import ( "time" "github.com/moov-io/base/log" + "github.com/moov-io/base/strx" "github.com/moov-io/watchman/pkg/csl" "github.com/moov-io/watchman/pkg/dpl" "github.com/moov-io/watchman/pkg/ofac" @@ -312,12 +313,21 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { searchTokensLength := sumLength(searchTokens) indexTokensLength := sumLength(indexedTokens) + disablePhoneticFiltering := strx.Yes(os.Getenv("DISABLE_PHONETIC_FILTERING")) + //Compare each search token to each indexed token. 
Sort the results in descending order - scores := make([]Score, 0, len(searchTokens)+len(indexedTokens)) + scoresCapacity := (len(searchTokens) + len(indexedTokens)) + if !disablePhoneticFiltering { + scoresCapacity /= 5 // reduce the capacity as many terms don't phonetically match + } + scores := make([]Score, 0, scoresCapacity) for searchIdx, searchToken := range searchTokens { for indexIdx, indexedToken := range indexedTokens { - score := customJaroWinkler(indexedToken, searchToken) - scores = append(scores, Score{score, searchIdx, indexIdx}) + // Compare the first letters phonetically and only run jaro-winkler on those which are similar + if disablePhoneticFiltering || firstCharacterSoundexMatch(indexedToken, searchToken) { + score := customJaroWinkler(indexedToken, searchToken) + scores = append(scores, Score{score, searchIdx, indexIdx}) + } } } sort.Slice(scores[:], func(i, j int) bool { diff --git a/cmd/server/search_handlers_bench_test.go b/cmd/server/search_handlers_bench_test.go index 0170296f..0a22735d 100644 --- a/cmd/server/search_handlers_bench_test.go +++ b/cmd/server/search_handlers_bench_test.go @@ -84,19 +84,3 @@ func BenchmarkJaroWinkler(b *testing.B) { } }) } - -// goos: darwin -// goarch: amd64 -// pkg: github.com/moov-io/watchman/cmd/server -// cpu: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz -// BenchmarkSearchHandler-16 2728 131 213 518 ns/op 34812129 B/op 1486792 allocs/op -// PASS -// ok github.com/moov-io/watchman/cmd/server 413.248s - -// goos: darwin -// goarch: amd64 -// pkg: github.com/moov-io/watchman/cmd/server -// cpu: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz -// BenchmarkSearchHandler-16 2079 174 594 246 ns/op 49797019 B/op 1638732 allocs/op -// PASS -// ok github.com/moov-io/watchman/cmd/server 419.284s diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go index db9ee0af..aa2e2582 100644 --- a/cmd/server/search_handlers_test.go +++ b/cmd/server/search_handlers_test.go @@ -241,13 +241,14 @@ func 
TestSearch__NameAndAltName(t *testing.T) { // OFAC require.Equal(t, "2681", wrapper.SDNs[0].EntityID) - require.Equal(t, "4691", wrapper.AltNames[0].EntityID) + require.Equal(t, "HAWATMA, Nayif", wrapper.SDNs[0].SDNName) + require.Equal(t, "559", wrapper.AltNames[0].EntityID) require.Equal(t, "735", wrapper.Addresses[0].EntityID) - require.Equal(t, "18782", wrapper.SectoralSanctions[0].EntityID) + require.Equal(t, "18736", wrapper.SectoralSanctions[0].EntityID) // BIS require.Equal(t, "P.O. BOX 28360", wrapper.DeniedPersons[0].StreetAddress) - require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.BISEntities[0].Name) + require.Equal(t, "Mohammad Jan Khan Mangal", wrapper.BISEntities[0].Name) } func TestSearch__Name(t *testing.T) { @@ -288,21 +289,13 @@ func TestSearch__Name(t *testing.T) { t.Fatalf("SDNs=%d Alts=%d SSIs=%d DPs=%d ELs=%d", len(wrapper.SDNs), len(wrapper.Alts), len(wrapper.SSIs), len(wrapper.DPs), len(wrapper.ELs)) } - if wrapper.SDNs[0].EntityID != "2676" { - t.Errorf("%#v", wrapper.SDNs[0]) - } - if wrapper.Alts[0].EntityID != "4691" { - t.Errorf("%#v", wrapper.Alts[0]) - } - if wrapper.SSIs[0].EntityID != "18782" { - t.Errorf("%#v", wrapper.SSIs[0]) - } - if wrapper.DPs[0].Name != "AL NASER WINGS AIRLINES" { - t.Errorf("%#v", wrapper.DPs[0]) - } - if wrapper.ELs[0].Name != "Luqman Yasin Yunus Shgragi" { - t.Errorf("%#v", wrapper.ELs[0]) - } + + require.Equal(t, "2676", wrapper.SDNs[0].EntityID) + require.Equal(t, "4691", wrapper.Alts[0].EntityID) + + require.Equal(t, "18736", wrapper.SSIs[0].EntityID) + require.Equal(t, "AL NASER WINGS AIRLINES", wrapper.DPs[0].Name) + require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.ELs[0].Name) } func TestSearch__AltName(t *testing.T) { diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index b656f30c..244f9683 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -441,12 +441,12 @@ func TestJaroWinkler(t *testing.T) { {strings.ToLower("WEI Zhao"), precompute("WEI, 
Zhao"), 1.0}, // apply jaroWinkler in both directions - {"jane doe", "jan lahore", 0.596}, - {"jan lahore", "jane doe", 0.596}, + {"jane doe", "jan lahore", 0.439}, + {"jan lahore", "jane doe", 0.549}, // real world case - {"john doe", "paul john", 0.533}, - {"john doe", "john othername", 0.672}, + {"john doe", "paul john", 0.624}, + {"john doe", "john othername", 0.440}, // close match {"jane doe", "jane doe2", 0.940}, @@ -465,12 +465,12 @@ func TestJaroWinkler(t *testing.T) { {"iap", "ian mckinley", 0.352}, {"ian mckinley", "ian", 0.891}, {"ian mckinley", "iap", 0.733}, - {"ian mckinley", "tian xiang 7", 0.526}, - {"bindaree food group pty", precompute("independent insurance group ltd"), 0.576}, // precompute removes ltd - {"bindaree food group pty ltd", "independent insurance group ltd", 0.631}, // only matches higher from 'ltd' - {"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.658}, - {"zincum llc", "easy verification inc.", 0.380}, - {"transpetrochart co ltd", "jx metals trading co.", 0.496}, + {"ian mckinley", "tian xiang 7", 0.000}, + {"bindaree food group pty", precompute("independent insurance group ltd"), 0.269}, // precompute removes ltd + {"bindaree food group pty ltd", "independent insurance group ltd", 0.401}, // only matches higher from 'ltd' + {"p.c.c. 
(singapore) private limited", "culver max entertainment private limited", 0.514}, + {"zincum llc", "easy verification inc.", 0.000}, + {"transpetrochart co ltd", "jx metals trading co.", 0.431}, {"technolab", "moomoo technologies inc", 0.565}, {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.480}, {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.094}, @@ -494,7 +494,7 @@ func TestJaroWinkler(t *testing.T) { {"nicolas, maduro moros", "nicolás maduro", 0.906}, {"africada financial services bureau change", "skylight", 0.441}, {"africada financial services bureau change", "skylight financial inc", 0.658}, - {"africada financial services bureau change", "skylight services inc", 0.621}, + {"africada financial services bureau change", "skylight services inc", 0.599}, {"africada financial services bureau change", "skylight financial services", 0.761}, {"africada financial services bureau change", "skylight financial services inc", 0.730}, @@ -503,29 +503,29 @@ func TestJaroWinkler(t *testing.T) { {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.682}, {"group preservation holy sites", "bridgespan group", 0.652}, - {"the group for the preservation of the holy sites", "the logan group", 0.730}, - {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.730}, - {"group preservation holy sites", "logan group", 0.649}, + {"the group for the preservation of the holy sites", "the logan group", 0.670}, + {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.670}, + {"group preservation holy sites", "logan group", 0.586}, - {"the group for the preservation of the holy sites", "the anything group", 0.698}, - {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.698}, - {"group preservation holy sites", "anything group", 
0.585}, + {"the group for the preservation of the holy sites", "the anything group", 0.546}, + {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.546}, + {"group preservation holy sites", "anything group", 0.488}, - {"the group for the preservation of the holy sites", "the hello world group", 0.706}, - {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.706}, - {"group preservation holy sites", "hello world group", 0.560}, + {"the group for the preservation of the holy sites", "the hello world group", 0.637}, + {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.637}, + {"group preservation holy sites", "hello world group", 0.577}, {"the group for the preservation of the holy sites", "the group", 0.880}, {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.880}, {"group preservation holy sites", "group", 0.879}, - {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.426}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.345}, { precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), - 0.446, + 0.366, }, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.334}, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.263}, // precompute {"i c sogo kenkyusho", precompute("A.I.C. 
SOGO KENKYUSHO"), 0.858}, diff --git a/docs/usage-configuration.md b/docs/usage-configuration.md index 7b9ff5c9..f4094b97 100644 --- a/docs/usage-configuration.md +++ b/docs/usage-configuration.md @@ -15,6 +15,11 @@ menubar: docs-menu | `SEARCH_MAX_WORKERS` | Maximum number of goroutines used for search. | 1024 | | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | +| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` | +| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9 | +| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 | +| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 | +| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 | | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 | | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 | | `LOG_FORMAT` | Format for logging lines to be written as. | Options: `json`, `plain` - Default: `plain` | @@ -34,9 +39,15 @@ menubar: docs-menu |-----|-----|-----| | `OFAC_DOWNLOAD_TEMPLATE` | HTTP address for downloading raw OFAC files. | `https://www.treasury.gov/ofac/downloads/%s` | | `DPL_DOWNLOAD_TEMPLATE` | HTTP address for downloading the DPL. | `https://www.bis.doc.gov/dpl/%s` | -| `CSL_DOWNLOAD_TEMPLATE` | HTTP address for downloading the Consolidated Screening List (CSL), which is a collection of US government sanctions lists. 
| `https://api.trade.gov/consolidated_screening_list/%s` | +| `EU_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading EU Consolidated Screening List | Subresource of `webgate.ec.europa.eu` | +| `WITH_EU_SCREENING_LIST` | Download and parse the EU Consolidated Screening List | Default: `true` | +| `UK_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading UK Consolidated Screening List | Subresource of `www.gov.uk` | +| `UK_SANCTIONS_LIST_URL` | Use an alternate URL for downloading UK Sanctions List | Subresource of `www.gov.uk` | +| `WITH_UK_SANCTIONS_LIST` | Download and parse the UK Sanctions List on startup. | Default: `false` | +| `US_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading US Consolidated Screening List | Subresource of `api.trade.gov` | +| `CSL_DOWNLOAD_TEMPLATE` | Same as `US_CSL_DOWNLOAD_URL` | | | `KEEP_STOPWORDS` | Boolean to keep stopwords in names. | `false` | -| `DEBUG_NAME_PIPELINE` | Boolean to pring debug messages for each name (SDN, SSI) processing step. | `false` | +| `DEBUG_NAME_PIPELINE` | Boolean to print debug messages for each name (SDN, SSI) processing step. | `false` | ## Data persistence