text_extractor.py
"""Extract raw text and named entities (contacts, education, skills, ...) from resume PDFs."""
import re
from collections import defaultdict
from typing import Dict, List, Set

import fitz  # PyMuPDF
import pdfplumber
import spacy
from dateutil import parser
from spacy import displacy
from urlextract import URLExtract

import text_preprocessing

# Background colours used by displacy when rendering each entity label.
NER_LABEL = {
    "PERSON_NAME": "rgb(238, 179, 252)",
    "ADDRESS": "rgb(238, 174, 202)",
    "EDUCATION": "#FFF5EE",
    "GPA": "#F9F1F0",
    "SKILL": "#EF9",
    "EXPERIENCE_LEVEL": "#F8AFA6",
    "JOB_TITLE": "#FAEBD7",
    "DATE_BIRTH": "#FFDEAD",
    "MAJOR": "#FFC0CB",
    "MARIAGE_STATUS": "#FFF0F5",
    "ORGANIZATION": "#E0FFFF",
    "GENDER": "#E0FFAA",
    "LOCATION": "#FFAAFF",
}


class ResumeExtractor:
    def __init__(self, ner_model_path: str):
        self.nlp = spacy.load(ner_model_path)

    def extract_text_from_pdf_file(self, file) -> str:
        """Extract raw text from an open PDF file object using pdfplumber."""
        cv_content = ""
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages without extractable text.
                cv_content += (page.extract_text() or "") + " "
        return text_preprocessing.clean_text(cv_content)

    def extract_text_from_pdf(self, filename: str) -> str:
        """Extract raw text from a PDF file on disk using PyMuPDF (fitz)."""
        cv_text = ""
        with fitz.open(filename) as pdf:
            for page in pdf:
                cv_text += page.get_text() + " "
        return text_preprocessing.clean_text(cv_text)

    def extract_email(self, text: str) -> List[str]:
        """Find e-mail addresses in the text."""
        email_token = r"[\w.+-]+@[\w-]+\.[\w.-]+"
        return re.findall(email_token, text)

    def extract_url(self, text: str) -> List[str]:
        """Find URLs (profiles, portfolios, ...) in the text."""
        extractor = URLExtract()
        return extractor.find_urls(text)

    def extract_phone(self, text: str) -> List[str]:
        """Find phone numbers in the text."""
        # phone_token = r"[(\+?84)0]\d{9,12}\s+"
        # Loose pattern: a 1-2 character prefix (e.g. a leading zero, or "+" and a digit)
        # followed by three groups of three digits, e.g. "0912 345 678".
        phone_token = r"[(\+?\d)0]{1,2}\s*\d{3}\s*\d{3}\s*\d{3}\b"
        return re.findall(phone_token, text)

    def format_date(self, date_str: str) -> str:
        """Normalise a date string to day-month-year; fall back to the raw string if parsing fails."""
        try:
            date = parser.parse(date_str)
            date_format = f"{date.day}-{date.month}-{date.year}"
        except Exception as e:
            print(str(e), e.__cause__)
            date_format = date_str
        return date_format

    def get_summary(self, resume_path: str) -> Dict[str, Set[str]]:
        """Run NER over a resume PDF and group the extracted entities by label."""
        dic = defaultdict(set)
        resume_content = self.extract_text_from_pdf(resume_path)
        doc = self.nlp(resume_content)
        for ent in doc.ents:
            if ent.label_ == "DATE_BIRTH":
                dic[ent.label_].add(self.format_date(ent.text))
            else:
                dic[ent.label_].add(ent.text)
        # E-mails, URLs and phone numbers come from rule-based extraction, not the NER model.
        dic["EMAIL"] = set(self.extract_email(resume_content))
        dic["PROFILE_URL"] = set(self.extract_url(resume_content))
        dic["PHONE"] = set(self.extract_phone(resume_content))
        return dic

    def get_summary_from_text(self, resume_content: str) -> Dict[str, List[str]]:
        """Same as get_summary, but for text that has already been extracted;
        values are returned as lists instead of sets."""
        dic = defaultdict(set)
        doc = self.nlp(resume_content)
        for ent in doc.ents:
            if ent.label_ == "DATE_BIRTH":
                dic[ent.label_].add(self.format_date(ent.text))
            else:
                dic[ent.label_].add(ent.text)
        dic["EMAIL"] = set(self.extract_email(resume_content))
        dic["PROFILE_URL"] = set(self.extract_url(resume_content))
        dic["PHONE"] = set(self.extract_phone(resume_content))
        return {label: list(values) for label, values in dic.items()}

    def render_html_entities(self, resume_content: str) -> str:
        """Render the recognised entities as an HTML page via displacy."""
        doc = self.nlp(resume_content)
        options = {"colors": NER_LABEL}
        return displacy.render(doc, style="ent", page=True, options=options)
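

# Minimal usage sketch (not part of the original module): it assumes a trained spaCy NER
# model saved at "./model/ner_resume" and a resume at "./data/sample_resume.pdf"; both
# paths are hypothetical placeholders for whatever the project actually uses.
if __name__ == "__main__":
    extractor = ResumeExtractor("./model/ner_resume")  # hypothetical model path
    summary = extractor.get_summary("./data/sample_resume.pdf")  # hypothetical resume path
    for label, values in summary.items():
        print(label, sorted(values))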