Skip to content

Commit

Permalink
Merge pull request #216 from SocialFinanceDigitalLabs/193-csww-ingest-sc
Browse files Browse the repository at this point in the history

193 csww ingest sc
  • Loading branch information
patrick-troy authored Nov 8, 2023
2 parents fdc5688 + 528ee0c commit ac574a1
Show file tree
Hide file tree
Showing 33 changed files with 2,745 additions and 79 deletions.
131 changes: 128 additions & 3 deletions liiatools/datasets/cin_census/lds_cin_clean/filters.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging
from typing import List

import xml.etree.ElementTree as ET
import xmlschema
from xmlschema import XMLSchemaValidatorError

from sfdata_stream_parser.checks import type_check
from sfdata_stream_parser import events
from sfdata_stream_parser.collectors import collector, block_check
from sfdata_stream_parser.filters.generic import streamfilter, pass_event
from xmlschema import XMLSchemaValidatorError

log = logging.getLogger(__name__)

Expand All @@ -29,7 +30,6 @@ def add_context(event, context: List[str]):
context.pop()
else:
local_context = tuple(context)

return event.from_event(event, context=local_context)


Expand All @@ -54,6 +54,102 @@ def strip_text(event):
return None


def _create_category_dict(field: str, file: str):
"""
Create a dictionary containing the different categorical values of a given field to conform categories
e.g. {'category': [{'code': '0', 'name': 'Not an Agency Worker'}, {'code': '1', 'name': 'Agency Worker'}]}
:param field: Name of the categorical field you want to find the values for
:param file: Path to the .xsd schema containing possible categories
:return: Dictionary of categorical values and potential alternatives
"""
category_dict = {"category": []}

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

if element is not None:
search_value = f".//{{http://www.w3.org/2001/XMLSchema}}enumeration"
value = element.findall(search_value)
if value:
for v in value:
code_dict = {"code": v.get("value")}
category_dict["category"].append(code_dict)

search_doc = f".//{{http://www.w3.org/2001/XMLSchema}}documentation"
documentation = element.findall(search_doc)
for i, d in enumerate(documentation):
name_dict = {"name": d.text}
category_dict["category"][i] = {
**category_dict["category"][i],
**name_dict,
}

return category_dict

else:
return


def _create_float_dict(field: str, file: str):
float_dict = None

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction"
restriction = element.findall(search_restriction)
for r in restriction:
code_dict = {
"numeric": r.get("base")[3:]
} # Remove the "xs:" from the start of the base string
if code_dict["numeric"] == "decimal":
float_dict = code_dict

search_fraction_digits = f".//{{http://www.w3.org/2001/XMLSchema}}fractionDigits"
fraction_digits = element.findall(search_fraction_digits)
for f in fraction_digits:
fraction_digits_dict = {"fixed": f.get("fixed"), "decimal": f.get("value")}
float_dict = {**float_dict, **fraction_digits_dict}

search_min_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}minInclusive"
min_inclusive = element.findall(search_min_inclusive)
for m in min_inclusive:
min_dict = {"min_inclusive": m.get("value")}
float_dict = {**float_dict, **min_dict}

search_max_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}maxInclusive"
max_inclusive = element.findall(search_max_inclusive)
for m in max_inclusive:
max_dict = {"max_inclusive": m.get("value")}
float_dict = {**float_dict, **max_dict}

return float_dict


def _create_regex_dict(field: str, file: str):
regex_dict = None

xsd_xml = ET.parse(file)
search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']"
element = xsd_xml.find(search_elem)

search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction"
restriction = element.findall(search_restriction)
for r in restriction:
if r.get("base") == "xs:string":
regex_dict = {"regex_string": None}

search_pattern = f".//{{http://www.w3.org/2001/XMLSchema}}pattern"
pattern = element.findall(search_pattern)
for p in pattern:
regex_dict["regex_string"] = p.get("value")

return regex_dict


@streamfilter()
def add_schema(event, schema: xmlschema.XMLSchema):
"""
Expand All @@ -71,9 +167,38 @@ def add_schema(event, schema: xmlschema.XMLSchema):
path = "/".join(event.context)
tag = event.context[-1]
el = schema.get_element(tag, path)

return event.from_event(event, path=path, schema=el)


@streamfilter(check=type_check(events.TextNode), fail_function=pass_event)
def add_schema_dict(event, schema_path: str):
    """
    Attach a schema_dict to each TextNode event describing the expected value
    format of its field (categories, numeric restrictions, regex pattern, date
    format, or plain string), derived from the event's schema type name and the
    .xsd schema file.
    :param event: A TextNode event carrying a .schema attribute
    :param schema_path: Path to the .xsd schema file
    :return: A copy of the event with a schema_dict attribute (None when the
        type is unrecognised)
    """
    schema_dict = None
    config_type = event.schema.type.name

    if config_type is not None:
        # Specific type names take precedence over the generic "*type" suffix
        # (matching the original cascade where later assignments won)
        if config_type in ("onedecimalplace", "twodecimalplaces", "ftetype"):
            schema_dict = _create_float_dict(config_type, schema_path)
        elif config_type == "swetype":
            schema_dict = _create_regex_dict(config_type, schema_path)
        elif config_type.endswith("type"):
            schema_dict = _create_category_dict(config_type, schema_path)
        elif config_type == "{http://www.w3.org/2001/XMLSchema}date":
            schema_dict = {"date": "%Y-%m-%d"}
        elif config_type == "{http://www.w3.org/2001/XMLSchema}integer":
            schema_dict = {"numeric": "integer"}
        elif config_type == "{http://www.w3.org/2001/XMLSchema}string":
            schema_dict = {"string": "alphanumeric"}

    if schema_dict is not None:
        # Minimum occurrence of 0 means the field may legitimately be blank
        min_occurs = event.schema.occurs[0]
        if min_occurs == 0:
            schema_dict = {**schema_dict, "canbeblank": "yes"}
        elif min_occurs == 1:
            schema_dict = {**schema_dict, "canbeblank": "no"}

    return event.from_event(event, schema_dict=schema_dict)


def _get_validation_error(schema, node) -> XMLSchemaValidatorError:
try:
schema.validate(node)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
import os
import pandas as pd
import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path
import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs


def FTESum():
"""
Calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and SeniorityName from
the input csv file
:return: Excel file with the name FTESum_5d.xlsx and the same path as the input file
"""

# ===== Read file ===== #
file = "CompMergSen.csv"
requestPath = work_path.request
Expand All @@ -26,6 +34,13 @@ def FTESum():


def FTESum_2020():
"""
Read a CSV file and calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and
SeniorityName for the year 2020
:return: Excel file with the name FTESum_2020.xlsx and the same path as the input file
"""

# ===== Read file ===== #
file = "CompMergSen.csv"
requestPath = work_path.request
Expand All @@ -34,14 +49,19 @@ def FTESum_2020():

df2020 = df[df["YearCensus"] == 2020]

df5D = df2020[["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]]
if df2020.empty:
AppLogs.log("FTESum_2020 error: No data for year 2020", console_output=True)
else:
df5D = df2020[
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]
]

df5D = df2020.groupby(
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"]
).agg(FTESum=("FTE", "sum"))
df5D = df2020.groupby(
["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"]
).agg(FTESum=("FTE", "sum"))

# ===== Save and export file ===== #
fileOutN = "FTESum_2020.xlsx"
requestPath = work_path.request
fileOut = os.path.join(requestPath, fileOutN)
df5D.to_excel(fileOut, merge_cells=False)
# ===== Save and export file ===== #
fileOutN = "FTESum_2020.xlsx"
requestPath = work_path.request
fileOut = os.path.join(requestPath, fileOutN)
df5D.to_excel(fileOut, merge_cells=False)
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
import os
import pandas as pd
import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path
import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs


def growth_tables():
"""
Create two Excel files with tables of growth rates and population growth for six LEAs
"""

growth_rate_df = {
"LEAName": [
"Havering",
Expand All @@ -34,7 +39,7 @@ def growth_tables():
fileOut = os.path.join(requestPath, fileOutN)
growth_rate_table.to_excel(fileOut, index=False)

print("Auxiliary table: ", fileOutN, " Created")
AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True)

"""
Population growth table: 2020 to 2026
Expand Down Expand Up @@ -66,4 +71,4 @@ def growth_tables():
fileOut = os.path.join(requestPath, fileOutN)
population_growth_table.to_excel(fileOut, index=False)

print("Auxiliary table: ", fileOutN, " Created")
AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True)
Loading

0 comments on commit ac574a1

Please sign in to comment.