From 2d3ea01805d1926581589e2ae840535d236699f6 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Thu, 27 Jun 2024 17:53:52 +0100 Subject: [PATCH] Use latest MS model version and correct some issues in the MS process - e.g. adding both the split and unsplit version of a multiskill entity --- .../extract_skills/extract_skills.py | 21 ++++++++++++------- .../extract_skills/extract_skills_utils.py | 4 ++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/ojd_daps_skills/extract_skills/extract_skills.py b/ojd_daps_skills/extract_skills/extract_skills.py index c6bb27dc..d46d579a 100644 --- a/ojd_daps_skills/extract_skills/extract_skills.py +++ b/ojd_daps_skills/extract_skills/extract_skills.py @@ -73,7 +73,7 @@ def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]: # map skills function - def get_skills(self, job_ad: str) -> Doc: + def get_skills(self, job_ad: str, min_length: int=75) -> Doc: """Return a spaCy Doc object with entities and split 'SKILL' spans. @@ -96,12 +96,19 @@ def get_skills(self, job_ad: str) -> Doc: if ent.label_ == "SKILL": ms_pred = self.extract_config.ms_model.predict([ent.text])[0] if ms_pred == 1: - for rule in rules: - split_ent = rule(ent) - if split_ent: - all_skill_ents += split_ent # Add the list of split skills - # else, if no split, append the original entity - all_skill_ents.append(ent) + split_found = False + # Only apply splitting if the entity length isn't too long + # otherwise it can be quite an inaccurate split + if len(ent.text) <= min_length: + for rule in rules: + split_ent = rule(ent) + if split_ent: + all_skill_ents += split_ent # Add the list of split skills + split_found = True + break # stop going through rules + if not split_found: + # else, if no split, append the original entity + all_skill_ents.append(ent) else: all_skill_ents.append(ent) diff --git a/ojd_daps_skills/extract_skills/extract_skills_utils.py b/ojd_daps_skills/extract_skills/extract_skills_utils.py index 89d896b1..778cbc28 100644 --- a/ojd_daps_skills/extract_skills/extract_skills_utils.py +++ b/ojd_daps_skills/extract_skills/extract_skills_utils.py @@ -124,11 +124,11 @@ def create( # Load multi-skill model ms_model_path = PUBLIC_MODEL_FOLDER_PATH / "ms_model" try: - clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl") + clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl") except Exception: msg.fail("Multi-skill classifier not loaded. Downloading model...") download(repo_id=ms_model_name, dst=ms_model_path) - clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl") + clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl") ms_model = Pipeline( [("transformer", MultiSkillTransformer()), ("classifier", clf)]