Skip to content

Commit

Permalink
Fix to loading excel from s3
Browse files Browse the repository at this point in the history
  • Loading branch information
lizgzil committed Dec 20, 2024
1 parent e491d44 commit 66726db
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 23 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ pip install nlp-link

### Basic usage

Note: the first time you import `NLPLinker` it will take some time to load.

Match two lists in python:

```python
Expand Down
14 changes: 11 additions & 3 deletions nlp_link/soc_mapper/soc_map_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
import pandas as pd

import re
import os

from nlp_link import soc_mapper_config
from nlp_link.utils.utils import get_df_from_excel_url
from nlp_link.utils.utils import get_df_from_excel_s3_path


def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame():
"""
Load the ONS dataset which gives SOC codes for thousands of job titles
"""

jobtitle_soc_data = get_df_from_excel_url(
soc_mapper_config["soc_data"]["soc_dir"],
soc_dir = soc_mapper_config["soc_data"]["soc_dir"]
dir_split = soc_dir.split("s3://")[1].split("/")

s3_bucket_name = dir_split[0]
s3_key = os.path.join("", *dir_split[1:])

jobtitle_soc_data = get_df_from_excel_s3_path(
bucket_name=s3_bucket_name,
key=s3_key,
sheet_name=soc_mapper_config["soc_data"]["sheet_name"],
converters={
soc_mapper_config["soc_data"]["soc_2020_ext_col"]: str,
Expand Down
28 changes: 8 additions & 20 deletions nlp_link/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,32 +96,20 @@ def save_json_dict(dictionary: dict, file_name: str):
logger.error(f'{file_name} has wrong file extension! Only supports "*.json"')


def get_content_from_url(url: str) -> BytesIO:
def get_df_from_excel_s3_path(bucket_name: str, key: str, **kwargs) -> pd.DataFrame:
    """
    Get a dataframe from an Excel file stored in S3.

    Args:
        bucket_name (str): Name of the S3 bucket containing the Excel file.
        key (str): Key of the Excel file within the bucket.
        **kwargs: Additional keyword arguments passed through to
            pd.read_excel() (e.g. sheet_name, converters).

    Returns:
        pd.DataFrame: Dataframe read from the Excel file.
    """

    s3 = boto3.client("s3")
    s3_data = s3.get_object(Bucket=bucket_name, Key=key)
    # Read the whole object body into memory and wrap it in BytesIO:
    # pd.read_excel needs a seekable file-like object, which the
    # streaming body returned by get_object is not.
    contents = s3_data["Body"].read()

    df = pd.read_excel(BytesIO(contents), **kwargs)
    return df

0 comments on commit 66726db

Please sign in to comment.