-
Notifications
You must be signed in to change notification settings - Fork 1
/
查重.py
55 lines (49 loc) · 2.14 KB
/
查重.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#Check duplicates between two big dataset.
#两个万级别条数数据集间的查重
#List查重更加高效
#Import packages
import difflib
import pandas as pd
import numpy as np
from nameparser import HumanName
import string
from titlecase import titlecase
import re
from difflib import SequenceMatcher
#Import data
df1 = pd.read_excel('excel1.xlsx',header = None, encoding = 'utf-8') #Import dataset without columns' names.
list1 = df1[0] #Save the first column as list1
df2 = pd.read_excel('excel2.xlsx',header = 0, encoding = 'utf-8') #Import dataset with the first row as columns' names.
list2 = df2['Name'] #Save the column named 'Name' as list2
#Add/Insert space between numbers and letters
#在数字和字母之间加空格
def spacer(text):
return re.sub(r'([0-9])([a-zA-Z])',r"\1 \2",text,re.MULTILINE).strip()
#Clean data. Uppercase all the Words. Deduplicate the new list.
for i in range(0,len(list1)):
list1[i] = list1[i].replace(". ",'.')
list1[i] = spacer(list1[i])
list1[i] = titlecase(list1[i])
list1 = list1.drop_duplicates(keep='first', inplace=False)
list1 = list1.reset_index(drop=True)
#pd.DataFrame(list1).to_excel('deduplicatelist.xlsx', header=False, index=False)
# Find item of list1 in list2 with similarity of > 95%
# 在 list2 中找到与 list1 中元素相似度达到95%及以上的元素
common = []
for item in list1:
comm = difflib.get_close_matches(item,list2,1,cutoff=0.95)
#difflib 对于两个超大list找相似元素十分高效。
#用fuzzywuzzy 处理上万级别的数据集比对需要以小时计数处理时间。
#同样的数据集,用difflib大概需要不到五分钟。
common.append(comm)
comcom = pd.DataFrame(common)
comcom = comcom.mask(comcom.eq('None')).dropna().reset_index(drop=True) #Drop all the rows with value 'None'.
for i in range(0,len(comcom)):
comcom[0][i] = spacer(comcom[0][i])
#pd.DataFrame(comcom).to_excel('include.xlsx', header=False, index=False)
# 找到list1和list2不能匹配的list1的部分
a = set(list1)
b = set(comcom[0])
diff = list(a-b)
diff = pd.DataFrame(diff)
#pd.DataFrame(diff).to_excel('notinclude.xlsx', header=False, index=False)