Skip to content

Commit

Permalink
Merge pull request #216 from SocialFinanceDigitalLabs/193-csww-ingest-sc
Browse files Browse the repository at this point in the history

193 csww ingest sc
  • Loading branch information
patrick-troy authored Nov 8, 2023
2 parents fdc5688 + 528ee0c commit ac574a1
Show file tree
Hide file tree
Showing 33 changed files with 2,745 additions and 79 deletions.
131 changes: 128 additions & 3 deletions liiatools/datasets/cin_census/lds_cin_clean/filters.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging
from typing import List

import xml.etree.ElementTree as ET
import xmlschema
from xmlschema import XMLSchemaValidatorError

from sfdata_stream_parser.checks import type_check
from sfdata_stream_parser import events
from sfdata_stream_parser.collectors import collector, block_check
from sfdata_stream_parser.filters.generic import streamfilter, pass_event
from xmlschema import XMLSchemaValidatorError

log = logging.getLogger(__name__)

Expand All @@ -29,7 +30,6 @@ def add_context(event, context: List[str]):
context.pop()
else:
local_context = tuple(context)

return event.from_event(event, context=local_context)


Expand All @@ -54,6 +54,102 @@ def strip_text(event):
return None


def _create_category_dict(field: str, file: str):
"""
Create a dictionary containing the different categorical values of a given field to conform categories
e.g. {'category': [{'code': '0', 'name': 'Not an Agency Worker'}, {'code': '1', 'name': 'Agency Worker'}]}
:param field: Name of the categorical field you want to find the values for
:param file: Path to the .xsd schema containing possible categories
:return: Dictionary of categorical values and potential alternatives
"""
category_dict = {"category": []}

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

if element is not None:
search_value = f".//{{http://www.w3.org/2001/XMLSchema}}enumeration"
value = element.findall(search_value)
if value:
for v in value:
code_dict = {"code": v.get("value")}
category_dict["category"].append(code_dict)

search_doc = f".//{{http://www.w3.org/2001/XMLSchema}}documentation"
documentation = element.findall(search_doc)
for i, d in enumerate(documentation):
name_dict = {"name": d.text}
category_dict["category"][i] = {
**category_dict["category"][i],
**name_dict,
}

return category_dict

else:
return


def _create_float_dict(field: str, file: str):
float_dict = None

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction"
restriction = element.findall(search_restriction)
for r in restriction:
code_dict = {
"numeric": r.get("base")[3:]
} # Remove the "xs:" from the start of the base string
if code_dict["numeric"] == "decimal":
float_dict = code_dict

search_fraction_digits = f".//{{http://www.w3.org/2001/XMLSchema}}fractionDigits"
fraction_digits = element.findall(search_fraction_digits)
for f in fraction_digits:
fraction_digits_dict = {"fixed": f.get("fixed"), "decimal": f.get("value")}
float_dict = {**float_dict, **fraction_digits_dict}

search_min_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}minInclusive"
min_inclusive = element.findall(search_min_inclusive)
for m in min_inclusive:
min_dict = {"min_inclusive": m.get("value")}
float_dict = {**float_dict, **min_dict}

search_max_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}maxInclusive"
max_inclusive = element.findall(search_max_inclusive)
for m in max_inclusive:
max_dict = {"max_inclusive": m.get("value")}
float_dict = {**float_dict, **max_dict}

return float_dict


def _create_regex_dict(field: str, file: str):
regex_dict = None

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction"
restriction = element.findall(search_restriction)
for r in restriction:
if r.get("base") == "xs:string":
regex_dict = {"regex_string": None}

search_pattern = f".//{{http://www.w3.org/2001/XMLSchema}}pattern"
pattern = element.findall(search_pattern)
for p in pattern:
regex_dict["regex_string"] = p.get("value")

return regex_dict


@streamfilter()
def add_schema(event, schema: xmlschema.XMLSchema):
"""
Expand All @@ -71,9 +167,38 @@ def add_schema(event, schema: xmlschema.XMLSchema):
path = "/".join(event.context)
tag = event.context[-1]
el = schema.get_element(tag, path)

return event.from_event(event, path=path, schema=el)


@streamfilter(check=type_check(events.TextNode), fail_function=pass_event)
def add_schema_dict(event, schema_path: str):
    """
    Attach a schema_dict to each TextNode event describing the expected value
    format of its field (categories, numeric restrictions, regex pattern, date
    format, or plain string), derived from the event's schema type name and the
    .xsd schema file.
    :param event: A TextNode event carrying a .schema attribute
    :param schema_path: Path to the .xsd schema file
    :return: A copy of the event with a schema_dict attribute (None when the
        type is unrecognised)
    """
    schema_dict = None
    config_type = event.schema.type.name

    if config_type is not None:
        # Specific type names take precedence over the generic "*type" suffix
        # (matching the original cascade where later assignments won)
        if config_type in ("onedecimalplace", "twodecimalplaces", "ftetype"):
            schema_dict = _create_float_dict(config_type, schema_path)
        elif config_type == "swetype":
            schema_dict = _create_regex_dict(config_type, schema_path)
        elif config_type.endswith("type"):
            schema_dict = _create_category_dict(config_type, schema_path)
        elif config_type == "{http://www.w3.org/2001/XMLSchema}date":
            schema_dict = {"date": "%Y-%m-%d"}
        elif config_type == "{http://www.w3.org/2001/XMLSchema}integer":
            schema_dict = {"numeric": "integer"}
        elif config_type == "{http://www.w3.org/2001/XMLSchema}string":
            schema_dict = {"string": "alphanumeric"}

    if schema_dict is not None:
        # Minimum occurrence of 0 means the field may legitimately be blank
        min_occurs = event.schema.occurs[0]
        if min_occurs == 0:
            schema_dict = {**schema_dict, "canbeblank": "yes"}
        elif min_occurs == 1:
            schema_dict = {**schema_dict, "canbeblank": "no"}

    return event.from_event(event, schema_dict=schema_dict)


def _get_validation_error(schema, node) -> XMLSchemaValidatorError:
try:
schema.validate(node)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
import os
import pandas as pd
import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path
import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs


def FTESum():
"""
Calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and SeniorityName from
the input csv file
:return: Excel file with the name FTESum_5d.xlsx and the same path as the input file
"""

# ===== Read file ===== #
file = "CompMergSen.csv"
requestPath = work_path.request
Expand All @@ -26,6 +34,13 @@ def FTESum():


def FTESum_2020():
"""
Read a CSV file and calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and
SeniorityName for the year 2020
:return: Excel file with the name FTESum_2020.xlsx and the same path as the input file
"""

# ===== Read file ===== #
file = "CompMergSen.csv"
requestPath = work_path.request
Expand All @@ -34,14 +49,19 @@ def FTESum_2020():

df2020 = df[df["YearCensus"] == 2020]

df5D = df2020[["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]]
if df2020.empty:
AppLogs.log("FTESum_2020 error: No data for year 2020", console_output=True)
else:
df5D = df2020[
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]
]

df5D = df2020.groupby(
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"]
).agg(FTESum=("FTE", "sum"))
df5D = df2020.groupby(
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"]
).agg(FTESum=("FTE", "sum"))

# ===== Save and export file ===== #
fileOutN = "FTESum_2020.xlsx"
requestPath = work_path.request
fileOut = os.path.join(requestPath, fileOutN)
df5D.to_excel(fileOut, merge_cells=False)
# ===== Save and export file ===== #
fileOutN = "FTESum_2020.xlsx"
requestPath = work_path.request
fileOut = os.path.join(requestPath, fileOutN)
df5D.to_excel(fileOut, merge_cells=False)
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
import os
import pandas as pd
import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path
import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs


def growth_tables():
"""
Create two Excel files with tables of growth rates and population growth for six LEAs
"""

growth_rate_df = {
"LEAName": [
"Havering",
Expand All @@ -34,7 +39,7 @@ def growth_tables():
fileOut = os.path.join(requestPath, fileOutN)
growth_rate_table.to_excel(fileOut, index=False)

print("Auxiliary table: ", fileOutN, " Created")
AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True)

"""
Population growth table: 2020 to 2026
Expand Down Expand Up @@ -66,4 +71,4 @@ def growth_tables():
fileOut = os.path.join(requestPath, fileOutN)
population_growth_table.to_excel(fileOut, index=False)

print("Auxiliary table: ", fileOutN, " Created")
AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True)
Loading

0 comments on commit ac574a1

Please sign in to comment.