Skip to content

Commit

Permalink
Fix to loading excel from s3
Browse files Browse the repository at this point in the history
  • Loading branch information
lizgzil committed Dec 20, 2024
1 parent e491d44 commit 66726db
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 23 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ pip install nlp-link

### Basic usage

Note: the first time you import `NLPLinker` it will take some time to load.

Match two lists in python:

```python
Expand Down
14 changes: 11 additions & 3 deletions nlp_link/soc_mapper/soc_map_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
import pandas as pd

import re
import os

from nlp_link import soc_mapper_config
from nlp_link.utils.utils import get_df_from_excel_url
from nlp_link.utils.utils import get_df_from_excel_s3_path


def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame():
"""
Load the ONS dataset which gives SOC codes for thousands of job titles
"""

jobtitle_soc_data = get_df_from_excel_url(
soc_mapper_config["soc_data"]["soc_dir"],
soc_dir = soc_mapper_config["soc_data"]["soc_dir"]
dir_split = soc_dir.split("s3://")[1].split("/")

s3_bucket_name = dir_split[0]
s3_key = os.path.join("", *dir_split[1:])

jobtitle_soc_data = get_df_from_excel_s3_path(
bucket_name=s3_bucket_name,
key=s3_key,
sheet_name=soc_mapper_config["soc_data"]["sheet_name"],
converters={
soc_mapper_config["soc_data"]["soc_2020_ext_col"]: str,
Expand Down
28 changes: 8 additions & 20 deletions nlp_link/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,32 +96,20 @@ def save_json_dict(dictionary: dict, file_name: str):
logger.error(f'{file_name} has wrong file extension! Only supports "*.json"')


def get_content_from_url(url: str) -> BytesIO:
def get_df_from_excel_s3_path(bucket_name: str, key: str, **kwargs) -> pd.DataFrame:
    """
    Get a dataframe from an Excel file stored in S3.

    Args:
        bucket_name (str): Name of the S3 bucket containing the Excel file.
        key (str): Key of the Excel file within the bucket.
        **kwargs: Additional keyword arguments passed through to
            pd.read_excel() (e.g. sheet_name, converters).

    Returns:
        pd.DataFrame: Dataframe read from the Excel file.
    """

    s3 = boto3.client("s3")
    s3_data = s3.get_object(Bucket=bucket_name, Key=key)
    # Read the whole object body into memory and wrap it in BytesIO:
    # pd.read_excel needs a seekable file-like object, which the
    # streaming body returned by get_object is not.
    contents = s3_data["Body"].read()

    df = pd.read_excel(BytesIO(contents), **kwargs)
    return df

0 comments on commit 66726db

Please sign in to comment.