From ec208b9f4a2b0cf6b20c77dba2565aaf9226d71c Mon Sep 17 00:00:00 2001 From: India Kerle Date: Fri, 10 May 2024 10:15:31 +0100 Subject: [PATCH] debug --- .../extract_skills/extract_skills.py | 5 ++++- .../format_taxonomy/esco_formatting.py | 3 +-- .../hard_coded_mapper_formatting.py | 3 +-- ojd_daps_skills/map_skills/skill_mapper.py | 17 +++++++++-------- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ojd_daps_skills/extract_skills/extract_skills.py b/ojd_daps_skills/extract_skills/extract_skills.py index 0b2cacce..db124758 100644 --- a/ojd_daps_skills/extract_skills/extract_skills.py +++ b/ojd_daps_skills/extract_skills/extract_skills.py @@ -7,7 +7,10 @@ from ojd_daps_skills import setup_spacy_extensions from ojd_daps_skills.extract_skills.extract_skills_utils import ExtractConfig from ojd_daps_skills.extract_skills.multiskill_rules import ( - _split_duplicate_object, _split_duplicate_verb, _split_skill_mentions) + _split_duplicate_object, + _split_duplicate_verb, + _split_skill_mentions, +) from ojd_daps_skills.map_skills.skill_mapper import SkillsMapper from ojd_daps_skills.map_skills.skill_mapper_utils import MapConfig from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash diff --git a/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py b/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py index e03c5612..c4cbe61e 100644 --- a/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py +++ b/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py @@ -18,8 +18,7 @@ from wasabi import msg from ojd_daps_skills import bucket_name -from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data, - save_to_s3) +from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3 def find_lev_0(code): diff --git a/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py b/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py index fa4d050b..c7eabae5 100644 --- a/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py +++ b/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py @@ -3,8 +3,7 @@ """ from ojd_daps_skills import bucket_name -from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data, - save_to_s3) +from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3 from ojd_daps_skills.utils.text_cleaning import short_hash if __name__ == "__main__": diff --git a/ojd_daps_skills/map_skills/skill_mapper.py b/ojd_daps_skills/map_skills/skill_mapper.py index b4c30c86..b0f88d57 100644 --- a/ojd_daps_skills/map_skills/skill_mapper.py +++ b/ojd_daps_skills/map_skills/skill_mapper.py @@ -11,7 +11,10 @@ from ojd_daps_skills import setup_spacy_extensions from ojd_daps_skills.map_skills.skill_mapper_utils import ( - MapConfig, get_most_common_code, get_top_comparisons) + MapConfig, + get_most_common_code, + get_top_comparisons, +) from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash setup_spacy_extensions() @@ -126,17 +129,17 @@ def get_embeddings( """ all_skills = list(chain.from_iterable([doc._.skill_spans for doc in job_ads])) all_skills_unique = list(set(all_skills)) - + if not isinstance(self.config.hard_coded_taxonomy, dict): self.config.hard_coded_taxonomy = {} - + self.all_skills_unique_dict = {} for skill in all_skills_unique: skill_clean = clean_text(skill) skill_hash = short_hash(skill_clean) if not self.config.hard_coded_taxonomy.get(skill_hash): self.all_skills_unique_dict[skill_hash] = skill_clean - + skill_embeddings = self.config.bert_model.transform( list(self.all_skills_unique_dict.values()) ) @@ -169,17 +172,15 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]: skill_embeddings, taxonomy_embeddings_dict = self.get_embeddings(job_ads) - ( top_skill_indxs, top_skill_scores, tax_skills_ix, ) = self.get_top_taxonomy_skills(skill_embeddings, taxonomy_embeddings_dict) - + print("top_skill_indxs", top_skill_indxs) print("top_skill_scores", top_skill_scores) print("tax_skills_ix", tax_skills_ix) - if self.config.taxonomy_config.get("skill_hier_info_col"): top_hier_skills, hier_types = self.get_top_hierarchy_skills( @@ -247,7 +248,7 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]: ) skill_mapper_list.append(match_results) - + return skill_mapper_list def match_skills(self, job_ads: List[Doc]) -> Dict[int, dict]: