diff --git a/ojd_daps_skills/extract_skills/extract_skills.py b/ojd_daps_skills/extract_skills/extract_skills.py index c6bb27dc..d46d579a 100644 --- a/ojd_daps_skills/extract_skills/extract_skills.py +++ b/ojd_daps_skills/extract_skills/extract_skills.py @@ -73,7 +73,7 @@ def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]: # map skills function - def get_skills(self, job_ad: str) -> Doc: + def get_skills(self, job_ad: str, min_length: int=75) -> Doc: """Return a spaCy Doc object with entities and split 'SKILL' spans. @@ -96,12 +96,19 @@ def get_skills(self, job_ad: str) -> Doc: if ent.label_ == "SKILL": ms_pred = self.extract_config.ms_model.predict([ent.text])[0] if ms_pred == 1: - for rule in rules: - split_ent = rule(ent) - if split_ent: - all_skill_ents += split_ent # Add the list of split skills - # else, if no split, append the original entity - all_skill_ents.append(ent) + split_found = False + # Only apply splitting if the entity length isn't too long + # otherwise it can be quite an inaccurate split + if len(ent.text) <= min_length: + for rule in rules: + split_ent = rule(ent) + if split_ent: + all_skill_ents += split_ent # Add the list of split skills + split_found = True + break # stop going through rules + if not split_found: + # else, if no split, append the original entity + all_skill_ents.append(ent) else: all_skill_ents.append(ent) diff --git a/ojd_daps_skills/extract_skills/extract_skills_utils.py b/ojd_daps_skills/extract_skills/extract_skills_utils.py index 89d896b1..778cbc28 100644 --- a/ojd_daps_skills/extract_skills/extract_skills_utils.py +++ b/ojd_daps_skills/extract_skills/extract_skills_utils.py @@ -124,11 +124,11 @@ def create( # Load multi-skill model ms_model_path = PUBLIC_MODEL_FOLDER_PATH / "ms_model" try: - clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl") + clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl") except Exception: msg.fail("Multi-skill classifier not loaded. Downloading model...") download(repo_id=ms_model_name, dst=ms_model_path) - clf = joblib.load(ms_model_path / "multiskill-classifier8lnyq0he.pkl") + clf = joblib.load(ms_model_path / "multiskill-classifiert4_v38_0.pkl") ms_model = Pipeline( [("transformer", MultiSkillTransformer()), ("classifier", clf)]