Skip to content

Commit

Permalink
Use latest multi-skill (MS) model version and correct some issues in the MS process…
Browse files Browse the repository at this point in the history
… - e.g. the issue of adding both the split and unsplit versions of a multiskill entity
  • Loading branch information
lizgzil committed Jun 27, 2024
1 parent c25a3c5 commit 2d3ea01
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
21 changes: 14 additions & 7 deletions ojd_daps_skills/extract_skills/extract_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]:

# map skills function

def get_skills(self, job_ad: str) -> Doc:
def get_skills(self, job_ad: str, min_length: int=75) -> Doc:
"""Return a spaCy Doc object with entities
and split 'SKILL' spans.
Expand All @@ -96,12 +96,19 @@ def get_skills(self, job_ad: str) -> Doc:
if ent.label_ == "SKILL":
ms_pred = self.extract_config.ms_model.predict([ent.text])[0]
if ms_pred == 1:
for rule in rules:
split_ent = rule(ent)
if split_ent:
all_skill_ents += split_ent # Add the list of split skills
# else, if no split, append the original entity
all_skill_ents.append(ent)
split_found = False
# Only apply splitting if the entity length isn't too long
# otherwise it can be quite an inaccurate split
if len(ent.text) <= min_length:
for rule in rules:
split_ent = rule(ent)
if split_ent:
all_skill_ents += split_ent # Add the list of split skills
split_found = True
break # stop going through rules
if not split_found:
# else, if no split, append the original entity
all_skill_ents.append(ent)
else:
all_skill_ents.append(ent)

Expand Down
4 changes: 2 additions & 2 deletions ojd_daps_skills/extract_skills/extract_skills_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,11 @@ def create(
# Load multi-skill model
ms_model_path = PUBLIC_MODEL_FOLDER_PATH / "ms_model"
try:
clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl")
clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl")
except Exception:
msg.fail("Multi-skill classifier not loaded. Downloading model...")
download(repo_id=ms_model_name, dst=ms_model_path)
clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl")
clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl")

ms_model = Pipeline(
[("transformer", MultiSkillTransformer()), ("classifier", clf)]
Expand Down

0 comments on commit 2d3ea01

Please sign in to comment.