diff --git a/nlp_link/soc_mapper/soc_map_utils.py b/nlp_link/soc_mapper/soc_map_utils.py
index a88b7b5..0d43299 100644
--- a/nlp_link/soc_mapper/soc_map_utils.py
+++ b/nlp_link/soc_mapper/soc_map_utils.py
@@ -3,6 +3,7 @@ import re
 
 
 from nlp_link import soc_mapper_config
+from nlp_link.utils.utils import get_df_from_excel_url
 
 
 def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame():
@@ -10,7 +11,7 @@ def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFr
     Load the ONS dataset which gives SOC codes for thousands of job titles
     """
 
-    jobtitle_soc_data = pd.read_excel(
+    jobtitle_soc_data = get_df_from_excel_url(
         soc_mapper_config["soc_data"]["soc_dir"],
         sheet_name=soc_mapper_config["soc_data"]["sheet_name"],
         converters={
@@ -81,15 +82,15 @@ def unique_soc_job_titles(jobtitle_soc_data: pd.DataFrame()) -> dict:
         ),
         axis=1,
     )
-    jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = (
-        jobtitle_soc_data.apply(
-            lambda x: (
-                x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
-                if pd.notnull(x[col_name_2])
-                else x[f"{col_name_0} and {col_name_1}"]
-            ),
-            axis=1,
-        )
+    jobtitle_soc_data[
+        f"{col_name_0} and {col_name_1} and {col_name_2}"
+    ] = jobtitle_soc_data.apply(
+        lambda x: (
+            x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
+            if pd.notnull(x[col_name_2])
+            else x[f"{col_name_0} and {col_name_1}"]
+        ),
+        axis=1,
     )
 
     # Try to find a unique job title to SOC 2020 4 or 6 code mapping
diff --git a/nlp_link/utils/utils.py b/nlp_link/utils/utils.py
index 5772f0c..90c5a84 100644
--- a/nlp_link/utils/utils.py
+++ b/nlp_link/utils/utils.py
@@ -3,6 +3,9 @@ from fnmatch import fnmatch
 from decimal import Decimal
 
 import numpy
+import requests
+from io import BytesIO
+import pandas as pd
 
 from nlp_link import logger
 
@@ -91,3 +94,34 @@ def save_json_dict(dictionary: dict, file_name: str):
         logger.info(f"Saved to {file_name} ...")
     else:
         logger.error(f'{file_name} has wrong file extension! Only supports "*.json"')
+
+
+def get_content_from_url(url: str) -> BytesIO:
+    """
+    Get BytesIO stream from URL.
+    Args
+        url (str): URL
+    Returns
+        io.BytesIO: content of URL as BytesIO stream
+    """
+    with requests.Session() as session:
+        res = session.get(url)
+    content = BytesIO(res.content)
+    return content
+
+
+def get_df_from_excel_url(url: str, **kwargs) -> pd.DataFrame:
+    """
+    Get dataframe from Excel file stored at URL.
+
+    Args
+        url (str): URL location of Excel file download
+        **kwargs for pd.read_excel()
+
+    Returns
+        pd.DataFrame: dataframe from Excel file
+    """
+    content = get_content_from_url(url)
+    df = pd.read_excel(content, **kwargs)
+
+    return df
diff --git a/pyproject.toml b/pyproject.toml
index 0438fec..abc67ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,9 +17,8 @@ tqdm = "^4.66.4"
 numpy = "^1.26.4"
 openpyxl = "^3.1.3"
 wasabi = "^1.1.3"
-s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
-boto3 = "*"
-botocore = "*"
+boto3 = "^1.34.99"
+botocore = "^1.34.99"
 
 [build-system]
 requires = ["poetry-core"]