Use a better method for reading from S3 that doesn't involve s3fs

nestauk · Dec 20, 2024 · e491d44 · e491d44
1 parent f4cedb2
commit e491d44
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 13 deletions.
diff --git a/nlp_link/soc_mapper/soc_map_utils.py b/nlp_link/soc_mapper/soc_map_utils.py
@@ -3,14 +3,15 @@
 import re
 
 from nlp_link import soc_mapper_config
+from nlp_link.utils.utils import get_df_from_excel_url
 
 
 def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame():
     """
     Load the ONS dataset which gives SOC codes for thousands of job titles
     """
 
-    jobtitle_soc_data = pd.read_excel(
+    jobtitle_soc_data = get_df_from_excel_url(
         soc_mapper_config["soc_data"]["soc_dir"],
         sheet_name=soc_mapper_config["soc_data"]["sheet_name"],
         converters={
@@ -81,15 +82,15 @@ def unique_soc_job_titles(jobtitle_soc_data: pd.DataFrame()) -> dict:
         ),
         axis=1,
     )
-    jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = (
-        jobtitle_soc_data.apply(
-            lambda x: (
-                x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
-                if pd.notnull(x[col_name_2])
-                else x[f"{col_name_0} and {col_name_1}"]
-            ),
-            axis=1,
-        )
+    jobtitle_soc_data[
+        f"{col_name_0} and {col_name_1} and {col_name_2}"
+    ] = jobtitle_soc_data.apply(
+        lambda x: (
+            x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
+            if pd.notnull(x[col_name_2])
+            else x[f"{col_name_0} and {col_name_1}"]
+        ),
+        axis=1,
     )
 
     # Try to find a unique job title to SOC 2020 4 or 6 code mapping

diff --git a/nlp_link/utils/utils.py b/nlp_link/utils/utils.py
@@ -3,6 +3,9 @@
 from fnmatch import fnmatch
 from decimal import Decimal
 import numpy
+import requests
+from io import BytesIO
+import pandas as pd
 
 from nlp_link import logger
 
@@ -91,3 +94,34 @@ def save_json_dict(dictionary: dict, file_name: str):
         logger.info(f"Saved to {file_name} ...")
     else:
         logger.error(f'{file_name} has wrong file extension! Only supports "*.json"')
+
+
+def get_content_from_url(url: str) -> BytesIO:
+    """
+    Get BytesIO stream from URL.
+    Args
+        url (str): URL
+    Returns
+        io.BytesIO: content of URL as BytesIO stream
+    """
+    with requests.Session() as session:
+        res = session.get(url)
+    content = BytesIO(res.content)
+    return content
+
+
+def get_df_from_excel_url(url: str, **kwargs) -> pd.DataFrame:
+    """
+    Get dataframe from Excel file stored at URL.
+
+    Args
+        url (str): URL location of Excel file download
+        **kwargs: keyword arguments passed through to pd.read_excel()
+
+    Returns
+        pd.DataFrame: dataframe from Excel file
+    """
+    content = get_content_from_url(url)
+    df = pd.read_excel(content, **kwargs)
+
+    return df
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,9 +17,8 @@ tqdm = "^4.66.4"
 numpy = "^1.26.4"
 openpyxl = "^3.1.3"
 wasabi = "^1.1.3"
-s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
-boto3 = "*"
-botocore = "*"
+boto3 = "^1.34.99"
+botocore = "^1.34.99"
 
 [build-system]
 requires = ["poetry-core"]