Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental Improved Search Algorithm #524

Merged
merged 7 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,18 @@ You should get this response:
PONG
```

### Configuration settings
### Configuration settings

| Environmental Variable | Description | Default |
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio between the lengths of two matching tokens before their score is penalised. | 0.9 |
| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 |
| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 |
| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
| `WEBHOOK_BATCH_SIZE` | How many watches to read from database per batch of async searches. | 100 |
Expand Down
6 changes: 3 additions & 3 deletions cmd/server/issue115_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ func TestIssue115__TopSDNs(t *testing.T) {
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}, nil, pipe)

out := s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 2680", out[0].match, 0.732)
eql(t, "issue115: top SDN 2680", out[0].match, 0.687)

// was 88.3% match
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe)

out = s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 18996", out[0].match, 0.764)
eql(t, "issue115: top SDN 18996", out[0].match, 0.650)

// another example
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe)
Expand All @@ -47,5 +47,5 @@ func TestIssue115__TopSDNs(t *testing.T) {
eql(t, "issue115: top SDN 0", out[0].match, 1.0)

out = s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 0", out[0].match, 0.667)
eql(t, "issue115: top SDN 0", out[0].match, 0.986)
}
76 changes: 76 additions & 0 deletions cmd/server/new_algorithm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2022 The Moov Authors
// Use of this source code is governed by an Apache License
// license that can be found in the LICENSE file.

package main

import (
"strings"
"testing"
)

// TestBestPairsJaroWinkler__FalsePositives walks a table of (indexed name, query)
// pairs that should not be treated as strong matches. For each pair it obtains
// both scores from compareAlgorithms and hands each one to eql together with the
// value recorded in the table.
func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) {
	cases := []struct {
		indexedName string
		query       string
		wantOld     float64
		wantNew     float64
	}{
		// Words in the query should be matched against at most one indexed word.
		// Doubled names on the sanctioned list can skew results.
		// 1. SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich
		{"vladimirov vladimir vladimirovich", "vladimir levenshtein", 0.961, 0.603},
		// 2. SDN Entity 7788 "SHAQIRI, Shaqir"
		{"shaqiri shaqir", "zaid shakir", 0.908, 0.704},

		// Single-word sanctioned names shouldn't match any query with that name part.
		// 1. SDN Entity 15050 "HADI"
		{"hadi", "hadi alwai", 0.900, 0.615},

		// Name-part scores should be weighted by the character length. If not,
		// small words can have unfair weight.
		// 1. SDN Entity "LI, Shangfu"
		{"li shangfu", "li shanlan", 0.914, 0.867},

		// Words with different lengths shouldn't match very highly.
		{"browningweight", "brown", 0.871, 0.703},

		// Words that start with different letters shouldn't match very highly.
		{"dominguez", "jimenez", 0.690, 0.580},
	}

	for _, tc := range cases {
		oldScore, newScore := compareAlgorithms(tc.indexedName, tc.query)
		eql(t, "Score is too high", oldScore, tc.wantOld)
		eql(t, "New score is better", newScore, tc.wantNew)
	}
}

// TestBestPairsJaroWinkler__TruePositives walks a table of (indexed name, query)
// pairs that should be treated as strong matches. For each pair it obtains both
// scores from compareAlgorithms and hands each one to eql together with the
// value recorded in the table.
func TestBestPairsJaroWinkler__TruePositives(t *testing.T) {
	cases := []struct {
		indexedName string
		query       string
		wantOld     float64
		wantNew     float64
	}{
		// Unmatched indexed words had a large weight, causing false negatives
		// for missing "middle names".
		// 1. Saddam Hussein
		{"saddam hussein al tikriti", "saddam hussien", 0.656, 0.924},
		// 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario"
		{"valencia trujillo joaquin mario", "valencia trujillo joaquin", 0.868, 0.973},
		// 3. SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich"
		{"lukashenko alexander grigoryevich", "alexander lukashenko", 0.765, 0.942},

		// Small words had too much weight, causing false negatives.
		// 1. SDN Entity 4691 "A.I.C. SOGO KENKYUSHO"
		{"a i c sogo kenkyusho", "sogo kenkyusho", 0.400, 0.972},
	}

	for _, tc := range cases {
		oldScore, newScore := compareAlgorithms(tc.indexedName, tc.query)
		eql(t, "Score is too low", oldScore, tc.wantOld)
		eql(t, "New score is better", newScore, tc.wantNew)
	}
}

// compareAlgorithms scores the same (indexedName, query) pair two ways and
// returns both results: first the value of jaroWinkler(indexedName, query),
// then the value of bestPairsJaroWinkler, which receives the query split into
// whitespace-separated fields along with the indexed name.
func compareAlgorithms(indexedName string, query string) (float64, float64) {
	// Calls in a return list are evaluated left to right, so jaroWinkler still
	// runs before bestPairsJaroWinkler.
	return jaroWinkler(indexedName, query),
		bestPairsJaroWinkler(strings.Fields(query), indexedName)
}
Loading
Loading