Skip to content

Commit

Permalink
Use better method for reading from S3 that doesn't involve s3fs
Browse files Browse the repository at this point in the history
  • Loading branch information
lizgzil committed Dec 20, 2024
1 parent f4cedb2 commit e491d44
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 13 deletions.
21 changes: 11 additions & 10 deletions nlp_link/soc_mapper/soc_map_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import re

from nlp_link import soc_mapper_config
from nlp_link.utils.utils import get_df_from_excel_url


def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame():
"""
Load the ONS dataset which gives SOC codes for thousands of job titles
"""

jobtitle_soc_data = pd.read_excel(
jobtitle_soc_data = get_df_from_excel_url(
soc_mapper_config["soc_data"]["soc_dir"],
sheet_name=soc_mapper_config["soc_data"]["sheet_name"],
converters={
Expand Down Expand Up @@ -81,15 +82,15 @@ def unique_soc_job_titles(jobtitle_soc_data: pd.DataFrame()) -> dict:
),
axis=1,
)
jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = (
jobtitle_soc_data.apply(
lambda x: (
x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
if pd.notnull(x[col_name_2])
else x[f"{col_name_0} and {col_name_1}"]
),
axis=1,
)
jobtitle_soc_data[
f"{col_name_0} and {col_name_1} and {col_name_2}"
] = jobtitle_soc_data.apply(
lambda x: (
x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2]
if pd.notnull(x[col_name_2])
else x[f"{col_name_0} and {col_name_1}"]
),
axis=1,
)

# Try to find a unique job title to SOC 2020 4 or 6 code mapping
Expand Down
34 changes: 34 additions & 0 deletions nlp_link/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from fnmatch import fnmatch
from decimal import Decimal
import numpy
import requests
from io import BytesIO
import pandas as pd

from nlp_link import logger

Expand Down Expand Up @@ -91,3 +94,34 @@ def save_json_dict(dictionary: dict, file_name: str):
logger.info(f"Saved to {file_name} ...")
else:
logger.error(f'{file_name} has wrong file extension! Only supports "*.json"')


def get_content_from_url(url: str, timeout: float = 60.0) -> BytesIO:
    """
    Get BytesIO stream from URL.

    Args
        url (str): URL
        timeout (float): seconds to wait for the server to connect or to send
            data before giving up (passed straight to requests)
    Returns
        io.BytesIO: content of URL as BytesIO stream
    Raises
        requests.HTTPError: if the server answers with a 4xx/5xx status
        requests.Timeout: if the server does not respond within `timeout`
    """
    with requests.Session() as session:
        # Without a timeout requests will wait forever on an unresponsive host.
        res = session.get(url, timeout=timeout)
        # Fail here on an error status rather than handing an HTML/XML error
        # page to the caller as though it were the requested file.
        res.raise_for_status()
        content = BytesIO(res.content)
    return content


def get_df_from_excel_url(url: str, **kwargs) -> pd.DataFrame:
    """
    Download the Excel file at a URL and parse it into a dataframe.

    Args
        url (str): URL location of Excel file download
        **kwargs: keyword arguments forwarded unchanged to pd.read_excel()
    Returns
        pd.DataFrame: dataframe from Excel file
    """
    # The downloaded bytes are handed to pandas as an in-memory file object.
    return pd.read_excel(get_content_from_url(url), **kwargs)
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ tqdm = "^4.66.4"
numpy = "^1.26.4"
openpyxl = "^3.1.3"
wasabi = "^1.1.3"
s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
boto3 = "*"
botocore = "*"
boto3 = "^1.34.99"
botocore = "^1.34.99"

[build-system]
requires = ["poetry-core"]
Expand Down

0 comments on commit e491d44

Please sign in to comment.