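"""ResumeExtractor.py

Flask app that accepts a PDF resume upload, extracts its text with PyMuPDF,
runs a small text-generation model (DistilGPT-2) plus simple regexes to pull
out name, location, skills, experience, education, and projects, lets the
user confirm those fields in a form, and then searches LinkedIn jobs (via a
RapidAPI endpoint) using the first listed skill.
"""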
import os
import re

import fitz  # PyMuPDF
import requests
from dotenv import load_dotenv
from flask import Flask, render_template, request, redirect, url_for, session
from huggingface_hub import login
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from werkzeug.utils import secure_filename

# Load the .env file before any os.getenv() call so SECRET_KEY and the
# Hugging Face token are actually visible when they are read below.
load_dotenv()

app = Flask(__name__)

# Set Flask's secret key from the environment (required for session support)
app.secret_key = os.getenv("SECRET_KEY")

# Directory to save uploaded PDFs; create it up front, otherwise
# file.save() in /upload raises FileNotFoundError on a fresh checkout.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER

# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf"}


# Check whether an uploaded filename has an allowed extension
def allowed_file(filename):
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

# Extract text from a PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:  # open the PDF and close it automatically
        for page in doc:  # loop through each page
            text += page.get_text("text")  # extract plain text
    return text
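
# For example (illustrative path): extract_text_from_pdf("uploads/resume.pdf")
# returns the concatenated plain text of every page in that document.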

# Load a small causal language model for text generation. The model used
# here is DistilGPT-2, which is a public checkpoint, so the Hugging Face
# token is only needed for gated or private models.
token = os.getenv("HUGGINGFACE_TOKEN")
if token:
    login(token=token)  # authenticate with Hugging Face when a token is set

# Load the model and tokenizer explicitly, passing the token through
model = AutoModelForCausalLM.from_pretrained("distilgpt2", token=token)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", token=token)

# Text-generation pipeline built from the loaded model and tokenizer
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
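
# A single call returns a list with one dict whose "generated_text" field
# contains the prompt plus the model's continuation, e.g. (illustrative):
#   generator("Skills: Python,", max_new_tokens=10)
#   -> [{"generated_text": "Skills: Python, SQL, ..."}]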

def extract_keywords_from_text(text):
    # Dictionary to hold the extracted data
    keywords = {
        "name": None,
        "location": None,
        "skills": None,
        "experience": None,
        "education": None,
        "projects": None,
    }

    # Step 1: extract the name from the first non-empty line of the resume,
    # which is the line most likely to contain it.
    first_line = next((line.strip() for line in text.splitlines() if line.strip()), "")

    # Regex pattern for capitalized names ("First Last", possibly more parts)
    name_pattern = r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)"
    name_match = re.match(name_pattern, first_line)
    if name_match:
        keywords["name"] = name_match.group(1).strip()

    # Step 2: run the language model over the resume text. truncation=True
    # keeps the prompt within DistilGPT-2's 1024-token context window (very
    # long resumes may still need shortening). generated_text includes the
    # prompt, so the regexes below effectively search the resume text plus
    # the model's continuation.
    response = generator(text, max_new_tokens=200, truncation=True)
    extracted_text = response[0]["generated_text"]

    # Step 3: pull out labelled sections with simple regexes (basic examples)
    section_patterns = {
        "location": r"(?i)Location:?\s*(.*)",
        "skills": r"(?i)Skills?:?\s*(.*)",
        "experience": r"(?i)Experience:?\s*(.*)",
        "education": r"(?i)Education:?\s*(.*)",
        "projects": r"(?i)Projects?:?\s*(.*)",
    }
    for field, pattern in section_patterns.items():
        match = re.search(pattern, extracted_text)
        if match:
            keywords[field] = match.group(1).strip()

    return keywords
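
# Example of the returned structure (values are illustrative, not real output):
#   {"name": "Jane Doe", "location": "Berlin", "skills": "Python, SQL",
#    "experience": "...", "education": "...", "projects": "..."}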
@app.route("/")
def home():
return render_template("index.html")
@app.route("/upload", methods=["POST"])
def upload_file():
if "file" not in request.files:
return redirect(request.url)
file = request.files["file"]
if file.filename == "":
return redirect(request.url)
if file and allowed_file(file.filename):
filename = secure_filename(file.filename)
file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
file.save(file_path)
# Step 1: Extract text from the uploaded PDF
resume_text = extract_text_from_pdf(file_path)
# Step 2: Use Mistral AI to extract keywords and insights from the resume text
extracted_keywords = extract_keywords_from_text(resume_text)
# Step 3: Display extracted details in a form or response
return render_template("form.html", keywords=extracted_keywords)
return redirect(request.url)
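
# Example of exercising the upload endpoint (assumes the Flask dev server is
# running on the default port 5000):
#   curl -F "file=@resume.pdf" http://localhost:5000/upload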
@app.route("/submit",methods=["POST"])
def submit_form():
# Get form data from the POST request
name = request.form.get('name')
location = request.form.get('location')
skills = request.form.get('skills')
experience = request.form.get('experience')
education = request.form.get('education')
projects = request.form.get('projects')
# Store the submitted form data in session to use later
session['submitted_data'] = {
'name': name,
'location': location,
'skills': skills,
'experience': experience,
'education': education,
'projects': projects
}
# For now, we will just print the data and redirect back to the home page
print(f"Name: {name}, Skills: {skills}, Experience: {experience}, Education: {education}, Projects: {projects}")
return redirect(url_for('linkdin'))

@app.route('/linkdin')
def linkdin():
    # Retrieve the submitted data from the session
    submitted_data = session.get('submitted_data', {})
    skills = submitted_data.get('skills', '')
    location = submitted_data.get('location', '')

    # Use the first non-empty skill from the comma-separated list as the keyword
    skills_list = [skill.strip() for skill in skills.split(',') if skill.strip()]
    first_skill = skills_list[0] if skills_list else None
    print(first_skill)
    if not first_skill:
        return "No skill provided to search jobs"

    # Prepare the LinkedIn job-search request; the "skills" field supplies the
    # job keyword. Note that locationId is currently hardcoded to the
    # 92000000 geo id, so the submitted location is not yet used in the query.
    career_keyword = first_skill
    location = location if location else "anywhere"  # default when none given

    url = "https://linkedin-api8.p.rapidapi.com/search-jobs"
    querystring = {
        "keywords": career_keyword,
        "locationId": "92000000",
        "datePosted": "anyTime",
        "sort": "mostRelevant",
    }
    headers = {
        # The API key is read from the environment; RAPIDAPI_KEY is an assumed
        # variable name, so add it to your .env rather than hardcoding it here.
        "x-rapidapi-key": os.getenv("RAPIDAPI_KEY", ""),
        "x-rapidapi-host": "linkedin-api8.p.rapidapi.com",
    }

    response = requests.get(url, headers=headers, params=querystring)
    print(f"Response code is: {response.status_code}")
    data = response.json() if response.ok else {}  # avoid JSON errors on failure
    print(data)

    # Pass the job data to the template
    return render_template('linkdin.html', jobs=data, user=session.get('user'))


if __name__ == "__main__":
    app.run(debug=True)