Skip to content

Commit

Permalink
update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
India Kerle committed May 8, 2024
1 parent b55432e commit 321f09a
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 15 deletions.
74 changes: 74 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,77 @@
To install as a package:

```
pipx install poetry
poetry shell
poetry install
```

To extract skills from a job advert:

```
from ojd_daps_skills.extract_skills.extract_skills import SkillsExtractor
sm = SkillsExtractor(taxonomy_name="toy")
✘ nestauk/en_skillner NER model not loaded. Downloading model...
Collecting en-skillner==any
Downloading https://huggingface.co/nestauk/en_skillner/resolve/main/en_skillner-any-py3-none-any.whl (587.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 587.7/587.7 MB 5.1 MB/s eta 0:00:0000:0100:01
Installing collected packages: en-skillner
Successfully installed en-skillner-3.7.1
✘ Multi-skill classifier not loaded. Downloading model...
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 26843.55it/s]
✘ Neccessary data files are not downloaded. Downloading ~0.5GB of
neccessary data files to
/Users/india.kerlenesta/Projects/nesta/ojd_daps/ojd_daps_extension/ojd_daps_skills/ojd_daps_skills_data.
ℹ Data folder downloaded from
/Users/india.kerlenesta/Projects/nesta/ojd_daps/ojd_daps_extension/ojd_daps_skills/ojd_daps_skills_data
job_ad = "You should be skilled in Python, Java and R."
job_ad_with_skills = sm(job_ad)
ℹ Getting embeddings for 3 texts ...
ℹ Took 0.018199920654296875 seconds
```

To access the extracted and mapped skills:

```
job_ad_with_skills_doc = job_ad_with_skills[0]
# Print the raw entities (multiskills are not split; also includes 'BENEFIT' and 'EXPERIENCE' spans)
job_ad_with_skills_doc.ents
>> (Python, Java, R.)
# Print the SKILL spans (spans predicted to be multiskills are split)
job_ad_with_skills_doc._.skill_spans
>> [Python, Java, R.]
# Print the skills mapped to the "toy" taxonomy
job_ad_with_skills_doc._.mapped_skills
>> [{'ojo_skill': 'Python',
'ojo_skill_id': 2232581233191055,
'match_skill': 'working with computers',
'match_score': 0.75,
'match_type': 'most_common_level_1',
'match_id': 'S5'},
{'ojo_skill': 'Java',
'ojo_skill_id': 2833100423969322,
'match_skill': 'working with computers',
'match_score': 0.6666666666666666,
'match_type': 'most_common_level_1',
'match_id': 'S5'},
{'ojo_skill': 'R.',
'ojo_skill_id': 8622187230313821,
'match_skill': 'working with computers',
'match_score': 0.6666666666666666,
'match_type': 'most_common_level_1',
'match_id': 'S5'}]
```

To run tests:

```
pytest tests/
```
18 changes: 13 additions & 5 deletions ojd_daps_skills/extract_skills/extract_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ojd_daps_skills.extract_skills.multiskill_rules import (
_split_duplicate_object, _split_duplicate_verb, _split_skill_mentions)
from ojd_daps_skills.map_skills.skill_mapper import SkillsMapper
from ojd_daps_skills.map_skills.skill_mapper_utils import MapConfig
from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash

setup_spacy_extensions()
Expand All @@ -30,15 +31,22 @@ class SkillsExtractor(BaseModel):

def __init__(
self,
taxonomy_name: str = "toy",
ner_model_name: str = "nestauk/en_skillner",
ms_model_name: str = "nestauk/multiskill-classifier",
):
super().__init__()
self._extract_config: ExtractConfig = ExtractConfig.create(
super().__init__(
taxonomy_name=taxonomy_name,
ner_model_name=ner_model_name,
ms_model_name=ms_model_name,
)
# Initialize additional properties if needed
self._extract_config = ExtractConfig.create(
ner_model_name=self.ner_model_name,
ms_model_name=self.ms_model_name,
)
self._skill_mapper: SkillsMapper = SkillsMapper(
taxonomy_name=self.taxonomy_name
)
self._map_config = MapConfig.create(taxonomy_name=self.taxonomy_name)
self._skill_mapper = SkillsMapper(config=self._map_config)

def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]:
"""Return a list of spaCy Doc objects with entities
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
|---|---|---|---|
id: A unique id for the skill/hierarchy
description: The skill/hierarchy level description text
description: The skill/hierarchy level description texts
type: What column name the skill/hier description is from (category, subcategory)
hierarchy_levels: If a skill then which hierarchy levels is it in
Expand Down Expand Up @@ -155,10 +155,10 @@ def remove_bad_hierarchy_levels(hierarchy_levels):
lightcast_formatted = pd.concat(
[all_skills, category_skills, subcategory_skills]
).reset_index(drop=True)
lightcast_formatted["hierarchy_levels"] = (
lightcast_formatted.hierarchy_levels.apply(map_subcategory_ids).apply(
remove_bad_hierarchy_levels
)
lightcast_formatted[
"hierarchy_levels"
] = lightcast_formatted.hierarchy_levels.apply(map_subcategory_ids).apply(
remove_bad_hierarchy_levels
)
lightcast_formatted = lightcast_formatted.query("description.notna()").query(
'description != "NULL"'
Expand Down
6 changes: 3 additions & 3 deletions ojd_daps_skills/map_skills/skill_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
high_hier_codes += [hier_level] * round(sim_score * 10)
high_tax_skills_results = {}
for hier_level in range(self.config.taxonomy_config["num_hier_levels"]):
high_tax_skills_results["most_common_level_" + str(hier_level)] = (
get_most_common_code(high_hier_codes, hier_level)
)
high_tax_skills_results[
"most_common_level_" + str(hier_level)
] = get_most_common_code(high_hier_codes, hier_level)

if high_tax_skills_results:
match_results["high_tax_skills"] = high_tax_skills_results
Expand Down
4 changes: 2 additions & 2 deletions ojd_daps_skills/map_skills/skill_mapper_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ class MapConfig(BaseModel):
taxonomy_data: pd.DataFrame
taxonomy_embeddings: Optional[Dict[int, np.array]]
hier_mapper: Dict[str, str]
hard_coded_taxonomy: Optional[Dict[int, dict]]
previous_skill_matches: Optional[Dict[int, str]]
hard_coded_taxonomy: Union[Dict[int, Any], None]
previous_skill_matches: Union[Dict[int, Any], None]

class Config:
arbitrary_types_allowed = True
Expand Down

0 comments on commit 321f09a

Please sign in to comment.