-
Notifications
You must be signed in to change notification settings - Fork 0
/
pii_detect.py
49 lines (40 loc) · 2.15 KB
/
pii_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# from nltk.tag.stanford import StanfordNERTagger
# from nltk.tokenize import word_tokenize
import re
import pandas as pd
class detection(object):
def __init__(self):
# self.standford_ner = StanfordNERTagger('./classifiers/english.all.3class.distsim.crf.ser.gz')
pass
def analyze(self,array):
self.array=array
phone_regex=re.compile("^\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*$",re.IGNORECASE)
street_regex=re.compile("^(\\d{1,}) [a-zA-Z0-9\\s]+(\\,)?$",re.IGNORECASE)
city_regex=re.compile("^[a-zA-Z]+(\\,)?$",re.IGNORECASE)
zipcode_regex=re.compile("^[0-9]{5,6}$",re.IGNORECASE)
state_regex=re.compile("^[A-Z]{2}",re.IGNORECASE)
whole_street_regex=re.compile("^(\\d{1,}) [a-zA-Z0-9\\s]+(\\,)? [a-zA-Z]+(\\,)? [A-Z]{2} [0-9]{5,6}$",re.IGNORECASE)
# array_join=",".join([str(value) for value in self.array])
# tokenize_text=word_tokenize(array_join)
# birthday_regex=re.comp # birthday regext don't know weather needed
# tokenize_text_clean=[]
# for i in tokenize_text:
# if i != " " and i !=",":
# tokenize_text_clean.append(i)
# classified_text=self.standford_ner.tag(tokenize_text_clean)
# tags=[self.standford_ner.tag(token_val) for token_val in tokenize_text]
# print(classified_text)
array_score=[]
### First go through Common regex
for value in self.array:
street_value= street_regex.match(str(value))
phone_value=phone_regex.match(str(value))
city_value=city_regex.match(str(value))
zipcode_value=zipcode_regex.match(str(value))
state_value=state_regex.match(str(value))
whole_street_value=whole_street_regex.match(str(value))
if state_value!=None or phone_value!=None or city_value!=None or zipcode_value!=None or street_value!=None or whole_street_value!=None:
array_score.append(1)
else:
array_score.append(0)
return sum(array_score) / len(array_score)