-
Notifications
You must be signed in to change notification settings - Fork 0
/
collectors.py
38 lines (29 loc) · 1.16 KB
/
collectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import numpy as np
from fuzzywuzzy import fuzz
import csv
# Module-level accumulator: every call to token_set_clustering() appends the
# clusters it finds to this list, so results from repeated calls pile up here.
cluster_list = []


# Find and cluster matching names
def token_set_clustering(input_list, threshold=80, scorer=None):
    """Greedily group similar names and record the groups in ``cluster_list``.

    Names are visited in input order.  For each visited name, every name that
    has not yet been placed in a cluster and whose similarity score against
    the visited name is at least ``threshold`` is added to a new cluster.
    A name that was already clustered is still used as a comparison anchor,
    so it can pull further, not-yet-clustered names into a cluster of their
    own.  Empty clusters are discarded.

    Args:
        input_list: Iterable of hashable names (strings in practice).
            Duplicates are tolerated; each distinct name is clustered once.
        threshold: Minimum score (inclusive) for two names to be grouped.
            Defaults to 80, the value previously hard-coded.
        scorer: Callable ``scorer(name_a, name_b) -> number``.  Defaults to
            ``fuzz.token_set_ratio`` from fuzzywuzzy.

    Returns:
        The list of clusters (each a list of names) found by this call.  The
        same clusters are also appended to the module-level ``cluster_list``.
    """
    if scorer is None:
        # Looked up at call time so the default stays fuzzywuzzy's scorer.
        scorer = fuzz.token_set_ratio

    all_names = list(input_list)  # allow any iterable; we iterate it repeatedly
    checked = set()               # names already assigned to some cluster
    found = []

    for name in all_names:
        cluster = []
        for candidate in all_names:
            # Skip anything clustered earlier, including earlier in this very
            # pass, so no name ever lands in two clusters.
            if candidate in checked:
                continue
            if scorer(name, candidate) >= threshold:
                checked.add(candidate)
                cluster.append(candidate)
        if cluster:
            found.append(cluster)

    cluster_list.extend(found)
    return found
# Get names from csv - should be unique
# newline='' hands line-ending handling to the csv module, as its docs advise.
with open('coll_names.csv', encoding='utf-8', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    # Keep the first column of every row.  Blank lines come back from
    # csv.reader as empty lists, so skip them instead of failing on row[0].
    names = [row[0] for row in readCSV if row]

# Called for its side effect: it fills the module-level cluster_list.
token_set_clustering(names)

# Print each cluster on its own line, then the total number of clusters.
for name_list in cluster_list:
    print(name_list)

print(len(cluster_list))