diff --git a/README.md b/README.md index 5d5fb32..1d210f6 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ pip install nlp-link ### Basic usage +Note: the first time you import `NLPLinker` it will take some time to load. + Match two lists in python: ```python diff --git a/nlp_link/soc_mapper/soc_map_utils.py b/nlp_link/soc_mapper/soc_map_utils.py index 0d43299..a983b68 100644 --- a/nlp_link/soc_mapper/soc_map_utils.py +++ b/nlp_link/soc_mapper/soc_map_utils.py @@ -1,9 +1,10 @@ import pandas as pd import re +import os from nlp_link import soc_mapper_config -from nlp_link.utils.utils import get_df_from_excel_url +from nlp_link.utils.utils import get_df_from_excel_s3_path def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame(): @@ -11,8 +12,15 @@ def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFr Load the ONS dataset which gives SOC codes for thousands of job titles """ - jobtitle_soc_data = get_df_from_excel_url( - soc_mapper_config["soc_data"]["soc_dir"], + soc_dir = soc_mapper_config["soc_data"]["soc_dir"] + dir_split = soc_dir.split("s3://")[1].split("/") + + s3_bucket_name = dir_split[0] + s3_key = os.path.join("", *dir_split[1:])  # NOTE(review): os.path.join uses os.sep, so on Windows this puts "\" in the S3 key — "/".join(dir_split[1:]) would be safer; confirm + + jobtitle_soc_data = get_df_from_excel_s3_path( + bucket_name=s3_bucket_name, + key=s3_key, sheet_name=soc_mapper_config["soc_data"]["sheet_name"], converters={ soc_mapper_config["soc_data"]["soc_2020_ext_col"]: str, diff --git a/nlp_link/utils/utils.py b/nlp_link/utils/utils.py index 90c5a84..8f64294 100644 --- a/nlp_link/utils/utils.py +++ b/nlp_link/utils/utils.py @@ -96,32 +96,20 @@ def save_json_dict(dictionary: dict, file_name: str): logger.error(f'{file_name} has wrong file extension! Only supports "*.json"') -def get_content_from_url(url: str) -> BytesIO: +def get_df_from_excel_s3_path(bucket_name: str, key: str, **kwargs) -> pd.DataFrame: """ - Get BytesIO stream from URL. 
- Args - url (str): URL - Returns - io.BytesIO: content of URL as BytesIO stream - """ - with requests.Session() as session: - res = session.get(url) - content = BytesIO(res.content) - return content - - -def get_df_from_excel_url(url: str, **kwargs) -> pd.DataFrame: - """ - Get dataframe from Excel file stored at URL. + Get dataframe from Excel file stored in s3 path. Args - url (str): URL location of Excel file download + bucket_name (str): Name of the S3 bucket; key (str): object key of the Excel file within the bucket **kwargs for pl.read_excel() - Returns pd.DataFrame: dataframe from Excel file """ - content = get_content_from_url(url) - df = pd.read_excel(content, **kwargs) + s3 = boto3.client("s3") + s3_data = s3.get_object(Bucket=bucket_name, Key=key) + contents = s3_data["Body"].read()  # full Excel file contents, read into memory as bytes + + df = pd.read_excel(BytesIO(contents), **kwargs) return df