From 819e4fb47c2862a56a48e83c5a60cd620852529b Mon Sep 17 00:00:00 2001
From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com>
Date: Tue, 20 Jun 2023 08:14:17 +0000
Subject: [PATCH 01/40] Work in progress

---
 .../csww_main_functions.py                   |  94 +++++++++
 .../lds_csww_clean/configuration.py          |  80 ++++++++
 .../lds_csww_clean/csww_record.py            | 179 ++++++++++++++++++
 .../lds_csww_clean/file_creator.py           |  62 ++++++
 .../lds_csww_clean/schema.py                 |  17 ++
 5 files changed, 432 insertions(+)
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/configuration.py
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py

diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py
index c68a681c..3184c26f 100644
--- a/liiatools/datasets/social_work_workforce/csww_main_functions.py
+++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py
@@ -1,9 +1,35 @@
+import logging
+import click_log
+import click as click
+from pathlib import Path
+from datetime import datetime
+
 from liiatools.datasets.social_work_workforce.sample_data import (
     generate_sample_csww_file,
 )
 from liiatools.csdatatools.util.stream import consume
 from liiatools.csdatatools.util.xml import etree, to_xml
 
+# Dependencies for cleanfile()
+from liiatools.csdatatools.util.xml import dom_parse
+from liiatools.csdatatools.datasets.cincensus import filters
+from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import Schema
+
+from liiatools.datasets.social_work_workforce.lds_csww_clean import (
+    file_creator,
+    configuration as clean_config,
+    csww_record
+)
+
+from liiatools.datasets.shared_functions.common import (
+    flip_dict,
+    check_file_type,
+    supported_file_types,
+    check_year,
+    check_year_within_range,
+    save_year_error,
+    save_incorrect_year_error
+)
 
 def generate_sample(output: str):
     """
@@ -25,3 +51,71 @@ def generate_sample(output: str):
         FILE.write(element)
     except FileNotFoundError:
         print("The file path provided does not exist")
+
+
+def cleanfile(input, la_code, la_log_dir, output):
+    """
+    Cleans input Children Social Work workforce xml files according to config and outputs cleaned csv files.
+ :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param la_code: should be a three-letter string for the local authority depositing the file + :param la_log_dir: should specify the path to the local authority's log folder + :param output: should specify the path to the output folder + :return: None + """ + + # Open & Parse file + print("Starting # Open & Parse file") + if ( + check_file_type( + input, + file_types=[".xml"], + supported_file_types=supported_file_types, + la_log_dir=la_log_dir, + ) + == "incorrect file type" + ): + return + stream = dom_parse(input) + stream = list(stream) + + # Get year from input file + try: + filename = str(Path(input).resolve().stem) + input_year = check_year(filename) + except (AttributeError, ValueError): + save_year_error(input, la_log_dir) + return + + # Check year is within acceptable range for data retention policy + print("Starting # Check year") + years_to_go_back = 6 + year_start_month = 6 + reference_date = datetime.now() + if check_year_within_range(input_year, years_to_go_back, year_start_month, reference_date) is False: + save_incorrect_year_error(input, la_log_dir) + return + + # Configure stream + print("Starting # Configure stream") + config = clean_config.Config() + la_name = flip_dict(config["data_codes"])[la_code] + stream = filters.strip_text(stream) + stream = filters.add_context(stream) + stream = filters.add_schema(stream, schema=Schema(input_year).schema) + + # Output result + #print("Starting # Output result") + stream = csww_record.message_collector(stream) # <=== this is the problem - not returning any stream data + #print(f"Stream = {stream}") + data = csww_record.export_table(stream) + #print(f"Data = {data}") + data = file_creator.add_fields(input_year, data, la_name, la_code) + #print(data) + file_creator.export_file(input, output, data) + +cleanfile("/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", + "BAD", + "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean" + ) + +print("===> Finished running csww_main_functions.py") \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/configuration.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/configuration.py new file mode 100644 index 00000000..8d3f291c --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/configuration.py @@ -0,0 +1,80 @@ +import datetime +import logging +import os +from pathlib import Path +import yaml +from string import Template + +from liiatools.spec import common as common_asset_dir + +log = logging.getLogger(__name__) + +COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent + + +class Config(dict): + def __init__(self, *config_files): + super().__init__() + + if not config_files: + config_files = [ + "DEFAULT_DATA_CODES", + ] + + for file in config_files: + if file == "DEFAULT_DATA_CODES": + file = COMMON_CONFIG_DIR / "LA-codes.yml" + self.load_config(file, conditional=False) + + self["config_date"] = datetime.datetime.now().isoformat() + try: + self["username"] = os.getlogin() + except OSError: + # This happens when tests are not run under a login shell, e.g. CI pipeline + pass + + def load_config(self, filename, conditional=False, warn=False): + """ + Load configuration from yaml file. 
Any loaded configuration + is only set if the values don't already exist in CONFIG. + + Files can contain ${} placeholders following the Python string.Template format. + The context will include any keys already existing in the configuration, any keys + from the current file - however, if these include placeholders, the placeholders + will not be replaced. Finally, environment variables can be referenced with + `os_environ_VARIABLE_NAME`. + + Keyword arguments: + filename -- Filename to load from + conditional -- If True, ignore file if it doesn't exist. If False, fail. (default False) + """ + if conditional and not os.path.isfile(filename): + if warn: + log.warning("Missing optional file {}".format(filename)) + + return + + with open(filename) as FILE: + user_config = yaml.load(FILE, Loader=yaml.FullLoader) + + log.info( + "Loading {} configuration values from '{}'.".format( + len(user_config), filename + ) + ) + + environment_dict = {"os_environ_{}".format(k): v for k, v in os.environ.items()} + + variables = dict(self) + variables.update(user_config) + variables.update(environment_dict) + + with open(filename, "rt") as FILE: + user_config_string = FILE.read() + + user_config_template = Template(user_config_string) + user_config_string = user_config_template.substitute(variables) + + user_config = yaml.load(user_config_string, Loader=yaml.FullLoader) + + self.update(user_config) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py new file mode 100644 index 00000000..57e0b379 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -0,0 +1,179 @@ +from typing import Iterator +import tablib +from more_itertools import peekable + +from sfdata_stream_parser import events +from sfdata_stream_parser.collectors import xml_collector + + +class CSWWEvent(events.ParseEvent): + pass + + +class HeaderEvent(events.ParseEvent): + pass + + +def _reduce_dict(dict_instance): + new_dict = {} + for key, value in dict_instance.items(): + if len(value) == 1: + new_dict[key] = value[0] + else: + new_dict[key] = value + return new_dict + + +@xml_collector +def text_collector(stream): + data_dict = {} + current_element = None + for event in stream: + if isinstance(event, events.StartElement): + current_element = event.tag + if isinstance(event, events.TextNode) and event.text: + data_dict.setdefault(current_element, []).append(event.text) + return _reduce_dict(data_dict) + + +# @xml_collector +# def csww_collector(stream): +# data_dict = {} +# stream = peekable(stream) +# last_tag = None +# while stream: +# event = stream.peek() +# last_tag = event.get("tag", last_tag) +# if event.get("tag") in ( +# "Assessments", +# "CINPlanDates", +# "Section47", +# "ChildProtectionPlans", +# ): +# data_dict.setdefault(event.tag, []).append(text_collector(stream)) +# else: +# if isinstance(event, events.TextNode) and event.text: +# data_dict.setdefault(last_tag, []).append(event.text) +# next(stream) + +# return _reduce_dict(data_dict) + + +# @xml_collector +# def cswwworker_collector(stream): +# data_dict = {} +# stream = peekable(stream) +# assert stream.peek().tag == "CSWWWorker" +# while stream: +# event = stream.peek() +# print(f"Event tag = {event.get('tag')}") +# if event.get("tag") == "CSWWdetails": +# data_dict.setdefault(event.tag, []).append(text_collector(stream)) +# # elif event.get("tag") == "LALevelVacancies": +# # data_dict.setdefault(event.tag, 
[]).append(csww_collector(stream)) +# else: +# next(stream) +# return _reduce_dict(data_dict) + + +@xml_collector +def message_collector(stream): + stream = peekable(stream) + assert stream.peek().tag == "Message", "Expected Message, got {}".format( + stream.peek().tag + ) + while stream: + event = stream.peek() + if event.get("tag") == "Header": + print(f"Header stream = {stream}") + header_record = text_collector(stream) + if header_record: + yield HeaderEvent(record=header_record) + elif event.get("tag") == "CSWWWorker": + csww_record = text_collector(stream) + if csww_record: + #print(f"yielding csww_record event: {CSWWEvent(record=csww_record)}") + #print(f"CSWWEvent(record=csww_record) = {CSWWEvent(record=csww_record)}") + yield CSWWEvent(record=csww_record) + else: + next(stream) + + +__EXPORT_HEADERS = [ + "AgencyWorker", + "SWENo", + "FTE", + "PersonBirthDate", + "GenderCurrent", + "Ethnicity", + "QualInst", + "StepUpGrad", + "RoleStartDate", + "StartOrigin", + "Cases30", + "WorkingDaysLost", + "ContractWeeks", + "FrontlineGrad", + "Absat30Sept", + "ReasonAbsence", + "CFKSSstatus", +] + + +def _maybe_list(value): + if value is None: + value = [] + if not isinstance(value, list): + value = [value] + print(f"maybe_list(value) = {value}") + return value + + +def csww_event(record, property, event_name=None): + if event_name is None: + event_name = property + value = record.get(property) + if value: + new_record = {**record, "Date": value, "Type": event_name} + return ({k: new_record.get(k) for k in __EXPORT_HEADERS},) + + return () + + +def event_to_records(event: CSWWEvent) -> Iterator[dict]: + record = event.record + #print(f"event.record = {record}") + # child = { + # **record.get("ChildIdentifiers", {}), + # **record.get("ChildCharacteristics", {}), + # } + # child["Disabilities"] = ",".join(_maybe_list(child.get("Disability"))) + + print(record.get("CSWWWorker")) + for csww_item in _maybe_list(record): + yield from csww_event({**child, **csww_item}, "CINreferralDate") + yield from csww_event({**child, **csww_item}, "CINclosureDate") + + for assessment in _maybe_list(csww_item.get("Assessments")): + assessment["Factors"] = ",".join( + _maybe_list(assessment.get("AssessmentFactors")) + ) + yield from csww_event( + {**child, **csww_item, **assessment}, "AssessmentActualStartDate" + ) + yield from csww_event( + {**child, **csww_item, **assessment}, "AssessmentAuthorisationDate" + ) + + +def export_table(stream): + #print(f"export_table() called for stream: {stream}") + data = tablib.Dataset(headers=__EXPORT_HEADERS) + #print(f"header data in export_table() = {data}") + for event in stream: + if isinstance(event, CSWWEvent): + for record in event_to_records(event): + #print("Found data to append") + data.append([record.get(k, "") for k in __EXPORT_HEADERS]) + else: print("No row data to append") + return data diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py new file mode 100644 index 00000000..d7200558 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -0,0 +1,62 @@ +from pathlib import Path +import pandas as pd +import logging + +from liiatools.datasets.shared_functions import converters, common + +log = logging.getLogger(__name__) + +def convert_to_dataframe(data): + data = data.export("df") + return data + + +def get_year(data, year): + data["YEAR"] = year + return data + +def convert_to_datetime(data): + data[["PersonBirthDate", 
"RoleStartDate"]] = data[ + ["PersonBirthDate", "RoleStartDate"] + ].apply(pd.to_datetime) + return data + + +def add_la_name(data, la_name): + data["LA"] = la_name + return data + + +# def la_prefix(data, la_code): +# data["LAchildID"] = data["LAchildID"] + "_" + la_code +# return data + + +def add_fields(input_year, data, la_name, la_code): + """ + Add YEAR, LA, PERSONSCHOOLYEAR to exported dataframe + Append LA_code from config to LAChildID + + :param input_year: A string of the year of return for the current file + :param data: The dataframe to be cleaned + :param la_name: LA name + :param la_code: LA code + :return: Cleaned and degraded dataframe + """ + data = convert_to_dataframe(data) + data = get_year(data, input_year) + data = convert_to_datetime(data) + #data = add_school_year(data) + data = add_la_name(data, la_name) + #data = la_prefix(data, la_code) + #data = degrade_dob(data) + #data = degrade_expected_dob(data) + #data = degrade_death_date(data) + return data + + +def export_file(input, output, data): + filename = Path(input).stem + outfile = filename + "_clean.csv" + output_path = Path(output, outfile) + data.to_csv(output_path, index=False) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py new file mode 100644 index 00000000..a8d67146 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py @@ -0,0 +1,17 @@ +from functools import cached_property +from pathlib import Path + +import xmlschema + +from liiatools.spec import social_work_workforce as csww_asset_dir + + +class Schema: + def __init__(self, year): + self.__year = year + + @cached_property + def schema(self) -> xmlschema.XMLSchema: + return xmlschema.XMLSchema( + Path(csww_asset_dir.__file__).parent / f"social_work_workforce_{self.__year}.xsd" + ) From f795e54f7bc8169c332b0dae715ea5471f35befc Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:44:27 +0000 Subject: [PATCH 02/40] Fix to produce cleanfile output --- .../csww_main_functions.py | 8 --- .../lds_csww_clean/csww_record.py | 71 +------------------ 2 files changed, 2 insertions(+), 77 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 3184c26f..a381812e 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -63,7 +63,6 @@ def cleanfile(input, la_code, la_log_dir, output): """ # Open & Parse file - print("Starting # Open & Parse file") if ( check_file_type( input, @@ -86,7 +85,6 @@ def cleanfile(input, la_code, la_log_dir, output): return # Check year is within acceptable range for data retention policy - print("Starting # Check year") years_to_go_back = 6 year_start_month = 6 reference_date = datetime.now() @@ -95,21 +93,15 @@ def cleanfile(input, la_code, la_log_dir, output): return # Configure stream - print("Starting # Configure stream") config = clean_config.Config() la_name = flip_dict(config["data_codes"])[la_code] stream = filters.strip_text(stream) stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) - # Output result - #print("Starting # Output result") stream = csww_record.message_collector(stream) # <=== this is the problem - not returning any stream data - #print(f"Stream = {stream}") data = 
csww_record.export_table(stream) - #print(f"Data = {data}") data = file_creator.add_fields(input_year, data, la_name, la_code) - #print(data) file_creator.export_file(input, output, data) cleanfile("/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 57e0b379..b0731819 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -36,46 +36,6 @@ def text_collector(stream): return _reduce_dict(data_dict) -# @xml_collector -# def csww_collector(stream): -# data_dict = {} -# stream = peekable(stream) -# last_tag = None -# while stream: -# event = stream.peek() -# last_tag = event.get("tag", last_tag) -# if event.get("tag") in ( -# "Assessments", -# "CINPlanDates", -# "Section47", -# "ChildProtectionPlans", -# ): -# data_dict.setdefault(event.tag, []).append(text_collector(stream)) -# else: -# if isinstance(event, events.TextNode) and event.text: -# data_dict.setdefault(last_tag, []).append(event.text) -# next(stream) - -# return _reduce_dict(data_dict) - - -# @xml_collector -# def cswwworker_collector(stream): -# data_dict = {} -# stream = peekable(stream) -# assert stream.peek().tag == "CSWWWorker" -# while stream: -# event = stream.peek() -# print(f"Event tag = {event.get('tag')}") -# if event.get("tag") == "CSWWdetails": -# data_dict.setdefault(event.tag, []).append(text_collector(stream)) -# # elif event.get("tag") == "LALevelVacancies": -# # data_dict.setdefault(event.tag, []).append(csww_collector(stream)) -# else: -# next(stream) -# return _reduce_dict(data_dict) - - @xml_collector def message_collector(stream): stream = peekable(stream) @@ -85,15 +45,12 @@ def message_collector(stream): while stream: event = stream.peek() if event.get("tag") == "Header": - print(f"Header stream = {stream}") header_record = text_collector(stream) if header_record: yield HeaderEvent(record=header_record) elif event.get("tag") == "CSWWWorker": csww_record = text_collector(stream) if csww_record: - #print(f"yielding csww_record event: {CSWWEvent(record=csww_record)}") - #print(f"CSWWEvent(record=csww_record) = {CSWWEvent(record=csww_record)}") yield CSWWEvent(record=csww_record) else: next(stream) @@ -125,7 +82,6 @@ def _maybe_list(value): value = [] if not isinstance(value, list): value = [value] - print(f"maybe_list(value) = {value}") return value @@ -142,38 +98,15 @@ def csww_event(record, property, event_name=None): def event_to_records(event: CSWWEvent) -> Iterator[dict]: record = event.record - #print(f"event.record = {record}") - # child = { - # **record.get("ChildIdentifiers", {}), - # **record.get("ChildCharacteristics", {}), - # } - # child["Disabilities"] = ",".join(_maybe_list(child.get("Disability"))) - - print(record.get("CSWWWorker")) - for csww_item in _maybe_list(record): - yield from csww_event({**child, **csww_item}, "CINreferralDate") - yield from csww_event({**child, **csww_item}, "CINclosureDate") - for assessment in _maybe_list(csww_item.get("Assessments")): - assessment["Factors"] = ",".join( - _maybe_list(assessment.get("AssessmentFactors")) - ) - yield from csww_event( - {**child, **csww_item, **assessment}, "AssessmentActualStartDate" - ) - yield from csww_event( - {**child, **csww_item, **assessment}, "AssessmentAuthorisationDate" - ) + for csww_item in 
_maybe_list(record): + yield from csww_event({**csww_item}, "StepUpGrad") def export_table(stream): - #print(f"export_table() called for stream: {stream}") data = tablib.Dataset(headers=__EXPORT_HEADERS) - #print(f"header data in export_table() = {data}") for event in stream: if isinstance(event, CSWWEvent): for record in event_to_records(event): - #print("Found data to append") data.append([record.get(k, "") for k in __EXPORT_HEADERS]) - else: print("No row data to append") return data From 8d3bd6c60da4959460e395bb894f3dd81527fbfa Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 21 Jun 2023 14:00:10 +0000 Subject: [PATCH 03/40] Run python black --- .../csww_main_functions.py | 31 +++++++++++++------ .../lds_csww_clean/csww_record.py | 2 +- .../lds_csww_clean/file_creator.py | 12 ++++--- .../lds_csww_clean/schema.py | 3 +- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index a381812e..30290cfb 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -18,7 +18,7 @@ from liiatools.datasets.social_work_workforce.lds_csww_clean import ( file_creator, configuration as clean_config, - csww_record + csww_record, ) from liiatools.datasets.shared_functions.common import ( @@ -28,9 +28,10 @@ check_year, check_year_within_range, save_year_error, - save_incorrect_year_error + save_incorrect_year_error, ) + def generate_sample(output: str): """ Export a sample file for testing @@ -52,6 +53,7 @@ def generate_sample(output: str): except FileNotFoundError: print("The file path provided does not exist") + def cleanfile(input, la_code, la_log_dir, output): """ Cleans input Children Social Work workforce xml files according to config and outputs cleaned csv files. 
@@ -88,7 +90,12 @@ def cleanfile(input, la_code, la_log_dir, output): years_to_go_back = 6 year_start_month = 6 reference_date = datetime.now() - if check_year_within_range(input_year, years_to_go_back, year_start_month, reference_date) is False: + if ( + check_year_within_range( + input_year, years_to_go_back, year_start_month, reference_date + ) + is False + ): save_incorrect_year_error(input, la_log_dir) return @@ -99,15 +106,19 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) # Output result - stream = csww_record.message_collector(stream) # <=== this is the problem - not returning any stream data + stream = csww_record.message_collector( + stream + ) # <=== this is the problem - not returning any stream data data = csww_record.export_table(stream) data = file_creator.add_fields(input_year, data, la_name, la_code) file_creator.export_file(input, output, data) -cleanfile("/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", - "BAD", - "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean" - ) -print("===> Finished running csww_main_functions.py") \ No newline at end of file +cleanfile( + "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", + "BAD", + "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) + +print("===> Finished running csww_main_functions.py") diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index b0731819..1a3c1b83 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -59,7 +59,7 @@ def message_collector(stream): __EXPORT_HEADERS = [ "AgencyWorker", "SWENo", - "FTE", + "FTE", "PersonBirthDate", "GenderCurrent", "Ethnicity", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index d7200558..d951c358 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -6,6 +6,7 @@ log = logging.getLogger(__name__) + def convert_to_dataframe(data): data = data.export("df") return data @@ -15,6 +16,7 @@ def get_year(data, year): data["YEAR"] = year return data + def convert_to_datetime(data): data[["PersonBirthDate", "RoleStartDate"]] = data[ ["PersonBirthDate", "RoleStartDate"] @@ -46,12 +48,12 @@ def add_fields(input_year, data, la_name, la_code): data = convert_to_dataframe(data) data = get_year(data, input_year) data = convert_to_datetime(data) - #data = add_school_year(data) + # data = add_school_year(data) data = add_la_name(data, la_name) - #data = la_prefix(data, la_code) - #data = degrade_dob(data) - #data = degrade_expected_dob(data) - #data = degrade_death_date(data) + # data = la_prefix(data, la_code) + # data = degrade_dob(data) + # data = degrade_expected_dob(data) + # data = degrade_death_date(data) return data diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py 
b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py index a8d67146..f82b5eb1 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py @@ -13,5 +13,6 @@ def __init__(self, year): @cached_property def schema(self) -> xmlschema.XMLSchema: return xmlschema.XMLSchema( - Path(csww_asset_dir.__file__).parent / f"social_work_workforce_{self.__year}.xsd" + Path(csww_asset_dir.__file__).parent + / f"social_work_workforce_{self.__year}.xsd" ) From ac82a0ca801978f291094bb55b6d841c35b0d643 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:48:05 +0000 Subject: [PATCH 04/40] Minor changes --- liiatools/datasets/social_work_workforce/csww_main_functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 30290cfb..970638d2 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -120,5 +120,3 @@ def cleanfile(input, la_code, la_log_dir, output): "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", ) - -print("===> Finished running csww_main_functions.py") From 333617e85e01837191a2cdbf02c0745ec685f16d Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:49:10 +0000 Subject: [PATCH 05/40] Sort imports --- .../social_work_workforce/csww_main_functions.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 970638d2..f10e0596 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -1,7 +1,7 @@ -import logging -import click_log -import click as click +# import logging +# import click_log from pathlib import Path +import click as click from datetime import datetime from liiatools.datasets.social_work_workforce.sample_data import ( @@ -105,10 +105,11 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.strip_text(stream) stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) + # Output result stream = csww_record.message_collector( stream - ) # <=== this is the problem - not returning any stream data + ) data = csww_record.export_table(stream) data = file_creator.add_fields(input_year, data, la_name, la_code) file_creator.export_file(input, output, data) From 0a077e2e13b46c44d3e6ec707c7df342cb57e2ef Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 4 Jul 2023 10:48:45 +0000 Subject: [PATCH 06/40] Create separate csv clean file for LA level data --- .../csww_main_functions.py | 24 ++++---- .../lds_csww_clean/csww_record.py | 58 +++++++++++++++---- .../lds_csww_clean/file_creator.py | 26 ++++++--- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index f10e0596..8e41ade8 100644 --- 
a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -1,7 +1,4 @@ -# import logging -# import click_log from pathlib import Path -import click as click from datetime import datetime from liiatools.datasets.social_work_workforce.sample_data import ( @@ -87,8 +84,8 @@ def cleanfile(input, la_code, la_log_dir, output): return # Check year is within acceptable range for data retention policy - years_to_go_back = 6 - year_start_month = 6 + years_to_go_back = 7 + year_start_month = 1 reference_date = datetime.now() if ( check_year_within_range( @@ -105,14 +102,15 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.strip_text(stream) stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) - - # Output result - stream = csww_record.message_collector( - stream - ) - data = csww_record.export_table(stream) - data = file_creator.add_fields(input_year, data, la_name, la_code) - file_creator.export_file(input, output, data) + + # Output results + stream = csww_record.message_collector(stream) + + data_worker, data_lalevel = csww_record.export_table(stream) + data_worker = file_creator.add_fields(input_year, data_worker, la_name) + data_lalevel = file_creator.add_fields(input_year, data_lalevel, la_name) + file_creator.export_file(input, output, data_lalevel, "lalevel") + file_creator.export_file(input, output, data_worker, "worker") cleanfile( diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 1a3c1b83..07414065 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -10,6 +10,10 @@ class CSWWEvent(events.ParseEvent): pass +class LALevelEvent(events.ParseEvent): + pass + + class HeaderEvent(events.ParseEvent): pass @@ -52,11 +56,15 @@ def message_collector(stream): csww_record = text_collector(stream) if csww_record: yield CSWWEvent(record=csww_record) + elif event.get("tag") == "LALevelVacancies": + lalevel_record = text_collector(stream) + if lalevel_record: + yield LALevelEvent(record=lalevel_record) else: next(stream) -__EXPORT_HEADERS = [ +__EXPORT_HEADERS_CSWWWORKER = [ "AgencyWorker", "SWENo", "FTE", @@ -76,6 +84,12 @@ def message_collector(stream): "CFKSSstatus", ] +__EXPORT_HEADERS_LALEVELVAC = [ + "NumberOfVacancies", + "NoAgencyFTE", + "NoAgencyHeadcount", +] + def _maybe_list(value): if value is None: @@ -85,28 +99,52 @@ def _maybe_list(value): return value -def csww_event(record, property, event_name=None): +def csww_event_worker(record, property, event_name=None): if event_name is None: event_name = property value = record.get(property) if value: new_record = {**record, "Date": value, "Type": event_name} - return ({k: new_record.get(k) for k in __EXPORT_HEADERS},) + return ({k: new_record.get(k) for k in __EXPORT_HEADERS_CSWWWORKER},) return () -def event_to_records(event: CSWWEvent) -> Iterator[dict]: - record = event.record +def lalevel_event(record, property, event_name=None): + if event_name is None: + event_name = property + value = record.get(property) + if value: + new_record = {**record, "Date": value, "Type": event_name} + return ({k: new_record.get(k) for k in __EXPORT_HEADERS_LALEVELVAC},) + + return () + +def event_to_records_worker(event: CSWWEvent) -> Iterator[dict]: + record = event.record for csww_item 
in _maybe_list(record): - yield from csww_event({**csww_item}, "StepUpGrad") + yield from csww_event_worker({**csww_item}, "StepUpGrad") + + +def event_to_records_lalevel(event: LALevelEvent) -> Iterator[dict]: + record = event.record + for lalevel_item in _maybe_list(record): + yield from lalevel_event({**lalevel_item}, "NoAgencyFTE") def export_table(stream): - data = tablib.Dataset(headers=__EXPORT_HEADERS) + data_worker = tablib.Dataset(headers=__EXPORT_HEADERS_CSWWWORKER) + data_lalevel = tablib.Dataset(headers=__EXPORT_HEADERS_LALEVELVAC) for event in stream: if isinstance(event, CSWWEvent): - for record in event_to_records(event): - data.append([record.get(k, "") for k in __EXPORT_HEADERS]) - return data + for record in event_to_records_worker(event): + data_worker.append( + [record.get(k, "") for k in __EXPORT_HEADERS_CSWWWORKER] + ) + elif isinstance(event, LALevelEvent): + for record in event_to_records_lalevel(event): + data_lalevel.append( + [record.get(k, "") for k in __EXPORT_HEADERS_LALEVELVAC] + ) + return data_worker, data_lalevel diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index d951c358..fa4d841b 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -34,22 +34,20 @@ def add_la_name(data, la_name): # return data -def add_fields(input_year, data, la_name, la_code): +def add_fields(input_year, data, la_name): """ - Add YEAR, LA, PERSONSCHOOLYEAR to exported dataframe - Append LA_code from config to LAChildID + Add YEAR, LA to exported dataframe :param input_year: A string of the year of return for the current file :param data: The dataframe to be cleaned :param la_name: LA name - :param la_code: LA code :return: Cleaned and degraded dataframe """ data = convert_to_dataframe(data) data = get_year(data, input_year) - data = convert_to_datetime(data) - # data = add_school_year(data) data = add_la_name(data, la_name) + + # data = convert_to_datetime(data) # data = la_prefix(data, la_code) # data = degrade_dob(data) # data = degrade_expected_dob(data) @@ -57,8 +55,18 @@ def add_fields(input_year, data, la_name, la_code): return data -def export_file(input, output, data): - filename = Path(input).stem - outfile = filename + "_clean.csv" +def export_file(input, output, data, filenamelevel): + """ + Output cleansed and degraded dataframe as csv file. 
+ Example of output filename: social_work_workforce_2022_lalevel_clean.csv + + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param output: should specify the path to the output folder + :param data: The cleansed dataframe to be output + :param filenamelevel: String appended to output filename indicating aggregation level - worker or LA level + :return: csv file containing cleaned and degraded dataframe + """ + filenamestem = Path(input).stem + outfile = filenamestem + "_" + filenamelevel + "_clean.csv" output_path = Path(output, outfile) data.to_csv(output_path, index=False) From 88a2f2953402b36331580a1260ebd8df8b5c5c74 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 4 Jul 2023 12:44:11 +0000 Subject: [PATCH 07/40] Simplify event_to_records --- .../lds_csww_clean/csww_record.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 07414065..c2f8fdd3 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -121,16 +121,10 @@ def lalevel_event(record, property, event_name=None): return () -def event_to_records_worker(event: CSWWEvent) -> Iterator[dict]: +def event_to_records(event) -> Iterator[dict]: record = event.record - for csww_item in _maybe_list(record): - yield from csww_event_worker({**csww_item}, "StepUpGrad") - - -def event_to_records_lalevel(event: LALevelEvent) -> Iterator[dict]: - record = event.record - for lalevel_item in _maybe_list(record): - yield from lalevel_event({**lalevel_item}, "NoAgencyFTE") + for item in _maybe_list(record): + yield from (item,) def export_table(stream): @@ -138,12 +132,12 @@ def export_table(stream): data_lalevel = tablib.Dataset(headers=__EXPORT_HEADERS_LALEVELVAC) for event in stream: if isinstance(event, CSWWEvent): - for record in event_to_records_worker(event): + for record in event_to_records(event): data_worker.append( [record.get(k, "") for k in __EXPORT_HEADERS_CSWWWORKER] ) elif isinstance(event, LALevelEvent): - for record in event_to_records_lalevel(event): + for record in event_to_records(event): data_lalevel.append( [record.get(k, "") for k in __EXPORT_HEADERS_LALEVELVAC] ) From 7e5102216ba6db9583ad24576c4137f867ba8b33 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 4 Jul 2023 12:50:14 +0000 Subject: [PATCH 08/40] Remove unnecessary event functions --- .../lds_csww_clean/csww_record.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index c2f8fdd3..7eea634b 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -99,28 +99,6 @@ def _maybe_list(value): return value -def csww_event_worker(record, property, event_name=None): - if event_name is None: - event_name = property - value = record.get(property) - if value: - new_record = {**record, "Date": value, "Type": event_name} - return ({k: new_record.get(k) for k in __EXPORT_HEADERS_CSWWWORKER},) - - return () - - -def lalevel_event(record, 
property, event_name=None): - if event_name is None: - event_name = property - value = record.get(property) - if value: - new_record = {**record, "Date": value, "Type": event_name} - return ({k: new_record.get(k) for k in __EXPORT_HEADERS_LALEVELVAC},) - - return () - - def event_to_records(event) -> Iterator[dict]: record = event.record for item in _maybe_list(record): From db9cd7d31d853c530e47bbc0f29c5e161df752d5 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 4 Jul 2023 15:09:34 +0000 Subject: [PATCH 09/40] Degrade dob and SWENo --- .../lds_csww_clean/file_creator.py | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index fa4d841b..3f383dcd 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -1,6 +1,9 @@ from pathlib import Path import pandas as pd import logging +import hashlib +from typing import Dict +from decouple import config from liiatools.datasets.shared_functions import converters, common @@ -18,9 +21,10 @@ def get_year(data, year): def convert_to_datetime(data): - data[["PersonBirthDate", "RoleStartDate"]] = data[ - ["PersonBirthDate", "RoleStartDate"] - ].apply(pd.to_datetime) + if set(["PersonBirthDate", "RoleStartDate"]).issubset(data): + data[["PersonBirthDate", "RoleStartDate"]] = data[ + ["PersonBirthDate", "RoleStartDate"] + ].apply(pd.to_datetime) return data @@ -29,9 +33,42 @@ def add_la_name(data, la_name): return data -# def la_prefix(data, la_code): -# data["LAchildID"] = data["LAchildID"] + "_" + la_code -# return data +def degrade_dob(data): + if "PersonBirthDate" in data: + if data["PersonBirthDate"] is not None: + data["PersonBirthDate"] = data["PersonBirthDate"].apply( + lambda row: converters.to_month_only_dob(row) + ) + return data + + +def degrade_SWENo(data): + """ + Replaces SWE number with hashed version + """ + if "SWENo" in data: + if data["SWENo"] is not None: + data["SWENo"] = data["SWENo"].apply(lambda row: swe_hash(row)) + return data + + +def swe_hash(swe_num): + """ + Converts the **SWENo** field to a hash code represented in HEX + :param swe_num: SWE number to be converted + :return: Hash code version of SWE number + """ + + private_string = config("sec_str", default="") + + private_key = swe_num + private_string + + # Preparing plain text (SWENo) to hash it + plaintext = private_key.encode() + + hash_algorithm = hashlib.sha3_256(plaintext) + + return hash_algorithm.hexdigest() def add_fields(input_year, data, la_name): @@ -47,11 +84,9 @@ def add_fields(input_year, data, la_name): data = get_year(data, input_year) data = add_la_name(data, la_name) - # data = convert_to_datetime(data) - # data = la_prefix(data, la_code) - # data = degrade_dob(data) - # data = degrade_expected_dob(data) - # data = degrade_death_date(data) + data = convert_to_datetime(data) + data = degrade_dob(data) + data = degrade_SWENo(data) return data From 57a90922163c2c45c42eb16a15edcd1f5ee663db Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 10:38:40 +0000 Subject: [PATCH 10/40] Create additional sample files for testing la_agg --- .../csww/BAD/social_work_workforce_2021.xml | 556 ++++++++++++++++++ .../csww/BAD/social_work_workforce_2022.xml | 4 +- 
 .../social_work_workforce_2021.xsd           | 254 ++++
 3 files changed, 812 insertions(+), 2 deletions(-)
 create mode 100644 liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
 create mode 100644 liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd

diff --git a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml b/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
new file mode 100644
index 00000000..8ef1b9ef
--- /dev/null
+++ b/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
@@ -0,0 +1,556 @@
+
+[556 added lines: a generated sample return whose XML element tags were lost in extraction, leaving only text values. The file holds a Message with a Header (Collection CSWW, Year 2021, ReferenceDate 2021-09-30, source level L, LEA 301, produced by liiatools.datasets.social_work_workforce.sample_data on 2023-03-28T14:54:55Z), an LALevelVacancies block (66.66, 40.40, 100) and thirty CSWWWorker records carrying values for fields such as AgencyWorker, SWENo (e.g. Ox2054309383), FTE, PersonBirthDate, GenderCurrent, Ethnicity, QualInst, RoleStartDate and absence details.]
\ No newline at end of file
diff --git a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml b/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml
index 8da0fed3..60a8164c 100644
--- a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml
+++ b/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml
@@ -2,8 +2,8 @@
 CSWW
-2020
+2022
-2023-03-28
+2022-09-30
 
 
 L
[element tags in this hunk were stripped in extraction; the change updates the sample header's year to 2022 and its reference date to 2022-09-30]
diff --git a/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd b/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd
new file mode 100644
index 00000000..bc9f98a3
--- /dev/null
+++ b/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd
@@ -0,0 +1,254 @@
+[254 added lines: the schema markup was stripped of its tags in extraction. The recoverable content is the set of enumerated code lists for the 2021 return: agency worker (Not an Agency Worker / Agency Worker); gender (Not Known / Male / Female / Not Specified); ethnicity (White British, White Irish, Any Other White Background, White and Black Caribbean, White and Black African, White and Asian, Any Other Mixed Background, Indian, Pakistani, Bangladeshi, Any Other Asian Background, Black Caribbean, Black African, Any Other Black Background, Chinese, Any Other Ethnic Group, Declared not stated or Refused, Information Not Yet Obtained); qualification type (Under-graduate / Post-graduate / Other); a No/Yes flag; role (Senior Manager, Middle Manager, First Line Manager, Senior Practicioner, Case Holder, Qualified Without Cases); start origin (Newly Qualified Social Workers through Not Yet Collected); end origin (Social Worker Role in a Different Local Authority in England through Not Yet Collected); leaver reason (Resignation, Voluntary Redundancy, Compulsory Redundancy, Dismissed, Retired, Deceased, Moved to a Non-child and Family Social Work Role Within LA, Other, Not Known, Not Yet Collected); absence reason (Maternity/Paternity leave, Other Paid Authorised Absence, Paid Absence For Public Duties, Sick Leave, Training, Unauthorised Absence, Unpaid Authorised Absence); and CFKSS status (Assessed and Supported Year in Employment (AYSE), Frontline Practitioner, Practice Supervisor, Practice Leader).]
\ No newline at end of file

From 3abf03c45c7dd19b4f1a0b47f58eeb8408b9c2ad Mon Sep 17 00:00:00 2001
From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com>
Date: Wed, 5 Jul 2023 10:47:51 +0000
Subject: [PATCH 11/40] Add la_agg functionality to workforce dataset

---
 .../social_work_workforce/csww_cli.py         | 25 +++++
 .../csww_main_functions.py                    | 91 +++++++++++++++--
 .../lds_csww_la_agg/configuration.py          | 75 ++++++++++++++
 .../lds_csww_la_agg/process.py                | 97 +++++++++++++++++++
 .../spec/social_work_workforce/la-agg.yml     | 67 +++++++++++++
 5 files changed, 347 insertions(+), 8 deletions(-)
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_la_agg/configuration.py
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_la_agg/process.py
 create mode
100644 liiatools/spec/social_work_workforce/la-agg.yml diff --git a/liiatools/datasets/social_work_workforce/csww_cli.py b/liiatools/datasets/social_work_workforce/csww_cli.py index 406ce1c5..0b687767 100644 --- a/liiatools/datasets/social_work_workforce/csww_cli.py +++ b/liiatools/datasets/social_work_workforce/csww_cli.py @@ -35,3 +35,28 @@ def generate_sample(output: str): """ output = csww_main_functions.generate_sample(output) return output + + +@csww.command() +@click.option( + "--i", + "input", + required=True, + type=str, + help="A string specifying the input file location, including the file name and suffix, usable by a pathlib Path function", +) +@click.option( + "--o", + "output", + required=True, + type=str, + help="A string specifying the output directory location", +) +def la_agg(input, output): + """ + Joins data from newly cleaned CSWW files (output of cleanfile()) to existing CSWW files data for the depositing local authority + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param output: should specify the path to the output folder + :return: None + """ + csww_main_functions.la_agg(input, output) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 8e41ade8..ced675fa 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -1,5 +1,6 @@ from pathlib import Path from datetime import datetime +import yaml from liiatools.datasets.social_work_workforce.sample_data import ( generate_sample_csww_file, @@ -18,6 +19,7 @@ csww_record, ) +from liiatools.spec import common as common_asset_dir from liiatools.datasets.shared_functions.common import ( flip_dict, check_file_type, @@ -28,6 +30,29 @@ save_incorrect_year_error, ) +# dependencies for la_agg() +from liiatools.datasets.social_work_workforce.lds_csww_la_agg import ( + configuration as agg_config, +) +from liiatools.datasets.social_work_workforce.lds_csww_la_agg import ( + process as agg_process, +) + +# dependencies for pan_agg() +# from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import configuration as pan_config +# from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import process as pan_process + + +COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent +# Get all the possible LA codes that could be used +with open(f"{COMMON_CONFIG_DIR}/LA-codes.yml") as las: + la_list = list(yaml.full_load(las)["data_codes"].values()) + +# Set constants for data retention period +YEARS_TO_GO_BACK = 7 +YEAR_START_MONTH = 1 +REFERENCE_DATE = datetime.now() + def generate_sample(output: str): """ @@ -84,12 +109,9 @@ def cleanfile(input, la_code, la_log_dir, output): return # Check year is within acceptable range for data retention policy - years_to_go_back = 7 - year_start_month = 1 - reference_date = datetime.now() if ( check_year_within_range( - input_year, years_to_go_back, year_start_month, reference_date + input_year, YEARS_TO_GO_BACK, YEAR_START_MONTH, REFERENCE_DATE ) is False ): @@ -113,9 +135,62 @@ def cleanfile(input, la_code, la_log_dir, output): file_creator.export_file(input, output, data_worker, "worker") -cleanfile( - "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", - "BAD", - "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +def la_agg(input, 
output): + """ + Joins data from newly cleaned social work workforce census files (output of cleanfile()) to existing social work workforce census files for the depositing local authority + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param output: should specify the path to the output folder + :return: None + """ + + # Configuration + config = agg_config.Config() + + # Open file as DataFrame and match file type + csww_df = agg_process.read_file(input) + print(f"csww_df = {csww_df}") + column_names = config["column_names"] + table_name = agg_process.match_load_file(csww_df, column_names) + + # Merge file with existing file of the same type in LA output folder + csww_df = agg_process.merge_la_files(output, csww_df, table_name) + + # De-duplicate and remove old data according to schema + if table_name == "CSWWWorker": + dates = config["dates"] + csww_df = agg_process.convert_datetimes(csww_df, dates, table_name) + sort_order = config["sort_order"] + dedup = config["dedup"] + csww_df = agg_process.deduplicate(csww_df, table_name, sort_order, dedup) + csww_df = agg_process.remove_old_data( + csww_df, + num_of_years=YEARS_TO_GO_BACK, + new_year_start_month=YEAR_START_MONTH, + as_at_date=REFERENCE_DATE, + ) + + # If file still has data, after removing old data: re-format and export merged file + if len(csww_df) > 0: + if table_name == "CSWWWorker": + csww_df = agg_process.convert_dates(csww_df, dates, table_name) + agg_process.export_la_file(output, table_name, csww_df) + + +# Run in Visual Studio Code |> + +# cleanfile( +# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml", +# "BAD", +# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) + +# la_agg( +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2021_worker_clean.csv", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) + +la_agg( + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", ) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_la_agg/configuration.py b/liiatools/datasets/social_work_workforce/lds_csww_la_agg/configuration.py new file mode 100644 index 00000000..5bb508b9 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_la_agg/configuration.py @@ -0,0 +1,75 @@ +from pathlib import Path +import logging +import datetime +import os +import yaml +from string import Template + +from liiatools.spec import social_work_workforce as csww_asset_dir + +log = logging.getLogger(__name__) + +DEFAULT_CONFIG_DIR = Path(csww_asset_dir.__file__).parent + + +class Config(dict): + def __init__(self, config_file=None): + super().__init__() + + if not config_file: + config_file = DEFAULT_CONFIG_DIR / "la-agg.yml" + + self.load_config(config_file, conditional=False) + + self["config_date"] = datetime.datetime.now().isoformat() + try: + self["username"] = os.getlogin() + except OSError: + # This happens when tests are not run under a login shell, e.g. CI pipeline + pass + + def load_config(self, filename, conditional=False, warn=False): + """ + Load configuration from yaml file. 
Any loaded configuration + is only set if the values don't already exist in CONFIG. + + Files can contain ${} placeholders following the Python string.Template format. + The context will include any keys already existing in the configuration, any keys + from the current file - however, if these include placeholders, the placeholders + will not be replaced. Finally, environment variables can be referenced with + `os_environ_VARIABLE_NAME`. + + Keyword arguments: + filename -- Filename to load from + conditional -- If True, ignore file if it doesn't exist. If False, fail. (default False) + """ + if conditional and not os.path.isfile(filename): + if warn: + log.warning("Missing optional file {}".format(filename)) + + return + + with open(filename) as FILE: + user_config = yaml.load(FILE, Loader=yaml.FullLoader) + + log.info( + "Loading {} configuration values from '{}'.".format( + len(user_config), filename + ) + ) + + environment_dict = {"os_environ_{}".format(k): v for k, v in os.environ.items()} + + variables = dict(self) + variables.update(user_config) + variables.update(environment_dict) + + with open(filename, "rt") as FILE: + user_config_string = FILE.read() + + user_config_template = Template(user_config_string) + user_config_string = user_config_template.substitute(variables) + + user_config = yaml.load(user_config_string, Loader=yaml.FullLoader) + + self.update(user_config) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_la_agg/process.py b/liiatools/datasets/social_work_workforce/lds_csww_la_agg/process.py new file mode 100644 index 00000000..44d4bd87 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_la_agg/process.py @@ -0,0 +1,97 @@ +from pathlib import Path +import pandas as pd +import logging + +log = logging.getLogger(__name__) + + +def read_file(file): + """ + Reads the csv file as a pandas DataFrame + """ + filepath = Path(file) + csww_df = pd.read_csv(filepath, index_col=None) + return csww_df + + +def match_load_file(csww_df, column_names): + """ + Matches the columns in the DataFrame against one of the 2 social work workforce file types + """ + for table_name, expected_columns in column_names.items(): + if set(csww_df.columns) == set(expected_columns): + return table_name + + +def merge_la_files(output, csww_df, table_name): + """ + Looks for existing file of the same type and merges with new file if found + """ + old_file = Path(output, f"CSWW_{table_name}_merged.csv") + if old_file.is_file(): + old_df = pd.read_csv(old_file, index_col=None) + merged_df = pd.concat([csww_df, old_df], axis=0) + else: + merged_df = csww_df + return merged_df + + +def convert_datetimes(csww_df, dates, table_name): + """ + Ensures that all date fields have been parsed as dates + """ + for date_field in dates[table_name]: + csww_df[date_field] = pd.to_datetime(csww_df[date_field], format="%Y/%m/%d") + return csww_df + + +def deduplicate(csww_df, table_name, sort_order, dedup): + """ + Sorts and removes duplicate records from merged files following schema + """ + csww_df = csww_df.sort_values( + sort_order[table_name], ascending=False, ignore_index=True + ) + csww_df = csww_df.drop_duplicates(subset=dedup[table_name], keep="first") + return csww_df + + +def remove_old_data(csww_df, num_of_years, new_year_start_month, as_at_date): + """ + Removes data older than a specified number of years as at reference date + + :param csww_df: Dataframe containing csv data + :param num_of_years: The number of years to go back + :param new_year_start_month: The month which 
signifies start of a new year for data retention policy + :param as_at_date: The reference date against which we are checking the valid range + :return: Dataframe with older years removed + """ + current_year = pd.to_datetime(as_at_date).year + current_month = pd.to_datetime(as_at_date).month + + if current_month < new_year_start_month: + earliest_allowed_year = current_year - num_of_years + else: + earliest_allowed_year = current_year - num_of_years + 1 # roll forward one year + + csww_df = csww_df[csww_df["YEAR"] >= earliest_allowed_year] + return csww_df + + +def convert_dates(csww_df, dates, table_name): + """ + Ensures that all date fields have been parsed as dates + """ + for date_field in dates[table_name]: + csww_df[date_field] = pd.to_datetime( + csww_df[date_field], format="%Y/%m/%d" + ).dt.date + return csww_df + + +def export_la_file(output, table_name, csww_df): + """ + Writes the output as a csv + """ + output_path = Path(output, f"CSWW_{table_name}_merged.csv") + csww_df.to_csv(output_path, index=False) diff --git a/liiatools/spec/social_work_workforce/la-agg.yml b/liiatools/spec/social_work_workforce/la-agg.yml new file mode 100644 index 00000000..d3b909ff --- /dev/null +++ b/liiatools/spec/social_work_workforce/la-agg.yml @@ -0,0 +1,67 @@ +column_names: + CSWWWorker: + - AgencyWorker + - SWENo + - FTE + - PersonBirthDate + - GenderCurrent + - Ethnicity + - QualInst + - StepUpGrad + - RoleStartDate + - StartOrigin + - Cases30 + - WorkingDaysLost + - ContractWeeks + - FrontlineGrad + - Absat30Sept + - ReasonAbsence + - CFKSSstatus + - LA + - YEAR + LALevelVacancies: + - NumberOfVacancies + - NoAgencyFTE + - NoAgencyHeadcount + - LA + - YEAR + +dates: + CSWWWorker: + - PersonBirthDate + - RoleStartDate + +sort_order: + CSWWWorker: + - YEAR + LALevelVacancies: + - YEAR + +dedup: + CSWWWorker: + - PersonBirthDate + - GenderCurrent + - Ethnicity + - QualInst + - StepUpGrad + - RoleStartDate + - StartOrigin + - Cases30 + - WorkingDaysLost + - ContractWeeks + - FrontlineGrad + - Absat30Sept + - ReasonAbsence + - CFKSSstatus + - LA + - YEAR + LALevelVacancies: + - NumberOfVacancies + - NoAgencyFTE + - NoAgencyHeadcount + - LA + - YEAR + + + + \ No newline at end of file From 9782042e05f9259ffa9f29c6ed0ea4aae5fe21e7 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 11:22:11 +0000 Subject: [PATCH 12/40] Remove duplicate schema.py from workforce --- .../datasets/social_work_workforce/csww_main_functions.py | 8 ++++---- liiatools/datasets/social_work_workforce/sample_data.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index ced675fa..5ddd167d 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -190,7 +190,7 @@ def la_agg(input, output): # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) -la_agg( - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) +# la_agg( +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) 
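The retention rule in remove_old_data() above is worth pinning down: the earliest allowed YEAR rolls forward by one as soon as the reference date reaches new_year_start_month. A minimal, self-contained check of that cutoff logic (assuming only pandas; the example values mirror the YEARS_TO_GO_BACK = 7 and YEAR_START_MONTH = 1 constants defined in csww_main_functions.py):

import pandas as pd

csww_df = pd.DataFrame({"YEAR": [2015, 2016, 2017, 2022]})
as_at_date = pd.to_datetime("2023-07-05")  # month 7 is not before new_year_start_month 1
earliest_allowed_year = as_at_date.year - 7 + 1  # rolls forward to 2017
print(csww_df[csww_df["YEAR"] >= earliest_allowed_year])  # keeps 2017 and 2022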
diff --git a/liiatools/datasets/social_work_workforce/sample_data.py b/liiatools/datasets/social_work_workforce/sample_data.py index e8fb749e..3ed1333a 100644 --- a/liiatools/datasets/social_work_workforce/sample_data.py +++ b/liiatools/datasets/social_work_workforce/sample_data.py @@ -5,7 +5,7 @@ from sfdata_stream_parser.events import StartElement, EndElement, TextNode -from liiatools.datasets.social_work_workforce.schema import Schema +from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import Schema def TextElement(tag: str, text): From 27119060bc1f66badd1f45ae26aebdb9f5ac3cbf Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 11:22:34 +0000 Subject: [PATCH 13/40] Remove duplicate schema.py from workforce --- .../datasets/social_work_workforce/schema.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 liiatools/datasets/social_work_workforce/schema.py diff --git a/liiatools/datasets/social_work_workforce/schema.py b/liiatools/datasets/social_work_workforce/schema.py deleted file mode 100644 index aebb3226..00000000 --- a/liiatools/datasets/social_work_workforce/schema.py +++ /dev/null @@ -1,18 +0,0 @@ -from functools import cached_property -from pathlib import Path - -from xmlschema import XMLSchema - -from liiatools.spec import social_work_workforce as social_work_workforce_dir - - -class Schema: - def __init__(self, year: int = 2022): - self.__year = year - - @cached_property - def schema(self) -> XMLSchema: - return XMLSchema( - Path(social_work_workforce_dir.__file__).parent - / f"social_work_workforce_{self.__year}.xsd" - ) From 1889518bdf4151da3c7feec688d733bb76cd4868 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 12:16:15 +0000 Subject: [PATCH 14/40] Create separate degrade_data function --- .../social_work_workforce/csww_main_functions.py | 10 ++++++---- .../lds_csww_clean/file_creator.py | 11 ++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 5ddd167d..44548cd7 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -129,10 +129,13 @@ def cleanfile(input, la_code, la_log_dir, output): stream = csww_record.message_collector(stream) data_worker, data_lalevel = csww_record.export_table(stream) + data_worker = file_creator.add_fields(input_year, data_worker, la_name) + data_worker = file_creator.degrade_data(data_worker) + file_creator.export_file(input, output, data_worker, "worker") + data_lalevel = file_creator.add_fields(input_year, data_lalevel, la_name) file_creator.export_file(input, output, data_lalevel, "lalevel") - file_creator.export_file(input, output, data_worker, "worker") def la_agg(input, output): @@ -148,7 +151,6 @@ def la_agg(input, output): # Open file as DataFrame and match file type csww_df = agg_process.read_file(input) - print(f"csww_df = {csww_df}") column_names = config["column_names"] table_name = agg_process.match_load_file(csww_df, column_names) @@ -179,14 +181,14 @@ def la_agg(input, output): # Run in Visual Studio Code |> # cleanfile( -# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml", +# 
"/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", # "BAD", # "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) # la_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2021_worker_clean.csv", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index 3f383dcd..66fe79a0 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -78,12 +78,21 @@ def add_fields(input_year, data, la_name): :param input_year: A string of the year of return for the current file :param data: The dataframe to be cleaned :param la_name: LA name - :return: Cleaned and degraded dataframe + :return: Dataframe with year and LA added """ data = convert_to_dataframe(data) data = get_year(data, input_year) data = add_la_name(data, la_name) + return data + + +def degrade_data(data): + """ + Degrade DoB to first of month and replace SWENo with hash code version + :param data: The dataframe to be cleaned + :return: Dataframe with degraded data + """ data = convert_to_datetime(data) data = degrade_dob(data) data = degrade_SWENo(data) From b5be8c5f51aca754be7a915dde5626e86c42bca4 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 12:26:15 +0000 Subject: [PATCH 15/40] Copy s903 pan_agg files to workforce --- .../lds_csww_pan_agg/configuration.py | 85 ++++++++++++++ .../lds_csww_pan_agg/process.py | 53 +++++++++ .../spec/social_work_workforce/pan-agg.yml | 109 ++++++++++++++++++ 3 files changed, 247 insertions(+) create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py create mode 100644 liiatools/spec/social_work_workforce/pan-agg.yml diff --git a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py new file mode 100644 index 00000000..3d7b6f00 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py @@ -0,0 +1,85 @@ +from pathlib import Path +import logging +import datetime +import os +import yaml +from string import Template + +from liiatools.spec import s903 as s903_asset_dir +from liiatools.spec import common as common_asset_dir + +log = logging.getLogger(__name__) + +DEFAULT_CONFIG_DIR = Path(s903_asset_dir.__file__).parent +COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent + + +class Config(dict): + def __init__(self, config_files=None): + super().__init__() + + if not config_files: + config_files = [ + "PAN_AGG_CONFIG", + "LA_CODES_CONFIG", + ] + + for file in config_files: + if file == "PAN_AGG_CONFIG": + file = DEFAULT_CONFIG_DIR / "pan-agg.yml" + elif file == "LA_CODES_CONFIG": + file = COMMON_CONFIG_DIR / "LA-codes.yml" + self.load_config(file, conditional=False) + + self["config_date"] = datetime.datetime.now().isoformat() + 
try: + self["username"] = os.getlogin() + except OSError: + # This happens when tests are not run under a login shell, e.g. CI pipeline + pass + + def load_config(self, filename, conditional=False, warn=False): + """ + Load configuration from yaml file. Any loaded configuration + is only set if the values don't already exist in CONFIG. + + Files can contain ${} placeholders following the Python string.Template format. + The context will include any keys already existing in the configuration, any keys + from the current file - however, if these include placeholders, the placeholders + will not be replaced. Finally, environment variables can be referenced with + `os_environ_VARIABLE_NAME`. + + Keyword arguments: + filename -- Filename to load from + conditional -- If True, ignore file if it doesn't exist. If False, fail. (default False) + """ + if conditional and not os.path.isfile(filename): + if warn: + log.warning("Missing optional file {}".format(filename)) + + return + + with open(filename) as FILE: + user_config = yaml.load(FILE, Loader=yaml.FullLoader) + + log.info( + "Loading {} configuration values from '{}'.".format( + len(user_config), filename + ) + ) + + environment_dict = {"os_environ_{}".format(k): v for k, v in os.environ.items()} + + variables = dict(self) + variables.update(user_config) + variables.update(environment_dict) + + with open(filename, "rt") as FILE: + user_config_string = FILE.read() + + user_config_template = Template(user_config_string) + user_config_string = user_config_template.substitute(variables) + + user_config = yaml.load(user_config_string, Loader=yaml.FullLoader) + + self.update(user_config) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py new file mode 100644 index 00000000..4417bc92 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py @@ -0,0 +1,53 @@ +from pathlib import Path +import pandas as pd +import logging + +log = logging.getLogger(__name__) + + +def read_file(file): + """ + Reads the csv file as a pandas DataFrame + """ + filepath = Path(file) + s903_df = pd.read_csv(filepath, index_col=None) + return s903_df + + +def match_load_file(s903_df, column_names): + """ + Matches the columns in the DataFrame against one of the 10 SSDA903 file types + """ + for table_name, expected_columns in column_names.items(): + if set(s903_df.columns) == set(expected_columns): + return table_name + + +def _merge_dfs(s903_df, old_df, la_name): + """ + Deletes existing data for new LA from pan file + Merges new LA data to pan file + """ + old_df = old_df.drop(old_df[old_df["LA"] == la_name].index) + s903_df = pd.concat([s903_df, old_df], axis=0, ignore_index=True) + return s903_df + + +def merge_agg_files(output, table_name, s903_df, la_name): + """ + Checks if pan file exists + Passes old and new file to function to be merged + """ + output_file = Path(output, f"pan_London_SSDA903_{table_name}.csv") + if output_file.is_file(): + old_df = pd.read_csv(output_file, index_col=None) + s903_df = _merge_dfs(s903_df, old_df, la_name) + return s903_df + + +def export_pan_file(output, table_name, s903_df): + """ + Writes file to output directory + """ + output_path = Path(output, f"pan_London_SSDA903_{table_name}.csv") + s903_df.to_csv(output_path, index=False) diff --git a/liiatools/spec/social_work_workforce/pan-agg.yml b/liiatools/spec/social_work_workforce/pan-agg.yml new file mode 100644 index 00000000..38b3dee2 --- /dev/null 
+++ b/liiatools/spec/social_work_workforce/pan-agg.yml @@ -0,0 +1,109 @@ +column_names: + Header: + - CHILD + - SEX + - DOB + - ETHNIC + - UPN + - MOTHER + - MC_DOB + - LA + - YEAR + Episodes: + - CHILD + - DECOM + - RNE + - LS + - CIN + - PLACE + - PLACE_PROVIDER + - DEC + - REC + - REASON_PLACE_CHANGE + - HOME_POST + - PL_POST + - URN + - LA + - YEAR + Reviews: + - CHILD + - DOB + - REVIEW + - REVIEW_CODE + - LA + - YEAR + UASC: + - CHILD + - SEX + - DOB + - DUC + - LA + - YEAR + OC2: + - CHILD + - DOB + - SDQ_SCORE + - SDQ_REASON + - CONVICTED + - HEALTH_CHECK + - IMMUNISATIONS + - TEETH_CHECK + - HEALTH_ASSESSMENT + - SUBSTANCE_MISUSE + - INTERVENTION_RECEIVED + - INTERVENTION_OFFERED + - LA + - YEAR + OC3: + - CHILD + - DOB + - IN_TOUCH + - ACTIV + - ACCOM + - LA + - YEAR + AD1: + - CHILD + - DOB + - DATE_INT + - DATE_MATCH + - FOSTER_CARE + - NB_ADOPTR + - SEX_ADOPTR + - LS_ADOPTR + - LA + - YEAR + PlacedAdoption: + - CHILD + - DOB + - DATE_PLACED + - DATE_PLACED_CEASED + - REASON_PLACED_CEASED + - LA + - YEAR + PrevPerm: + - CHILD + - DOB + - PREV_PERM + - LA_PERM + - DATE_PERM + - LA + - YEAR + Missing: + - CHILD + - DOB + - MISSING + - MIS_START + - MIS_END + - LA + - YEAR + +pan_data_kept: + - Header + - Episodes + - Reviews + - UASC + - OC2 + - OC3 + - PrevPerm + - Missing \ No newline at end of file From 9fdabda90f03286d5827b76f5feca64a0212f3ff Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 5 Jul 2023 13:25:38 +0000 Subject: [PATCH 16/40] Add pan_agg functionality to workforce --- .../social_work_workforce/csww_cli.py | 32 +++++ .../csww_main_functions.py | 67 ++++++--- .../lds_csww_pan_agg/configuration.py | 6 +- .../lds_csww_pan_agg/process.py | 30 ++-- .../spec/social_work_workforce/pan-agg.yml | 134 ++++-------------- 5 files changed, 127 insertions(+), 142 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_cli.py b/liiatools/datasets/social_work_workforce/csww_cli.py index 0b687767..68e8600c 100644 --- a/liiatools/datasets/social_work_workforce/csww_cli.py +++ b/liiatools/datasets/social_work_workforce/csww_cli.py @@ -60,3 +60,35 @@ def la_agg(input, output): :return: None """ csww_main_functions.la_agg(input, output) + + +@csww.command() +@click.option( + "--i", + "input", + required=True, + type=str, + help="A string specifying the input file location, including the file name and suffix, usable by a pathlib Path function", +) +@click.option( + "--la_code", + required=True, + type=click.Choice(la_list, case_sensitive=False), + help="A three letter code, specifying the local authority that deposited the file", +) +@click.option( + "--o", + "output", + required=True, + type=str, + help="A string specifying the output directory location", +) +def pan_agg(input, la_code, output): + """ + Joins data from newly merged social work workforce file (output of la-agg()) to existing pan-London social work workforce data + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param la_code: should be a three-letter string for the local authority depositing the file + :param output: should specify the path to the output folder + :return: None + """ + csww_main_functions.pan_agg(input, la_code, output) \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 44548cd7..da651312 100644 --- 
a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -39,8 +39,8 @@ ) # dependencies for pan_agg() -# from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import configuration as pan_config -# from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import process as pan_process +from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import configuration as pan_config +from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import process as pan_process COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent @@ -178,21 +178,52 @@ def la_agg(input, output): agg_process.export_la_file(output, table_name, csww_df) +def pan_agg(input, la_code, output): + """ + Joins data from newly merged social work workforce file (output of la-agg()) to existing pan-London workforce data + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param la_code: should be a three-letter string for the local authority depositing the file + :param output: should specify the path to the output folder + :return: None + """ + + # Configuration + config = pan_config.Config() + + # Read file and match type + csww_df = pan_process.read_file(input) + column_names = config["column_names"] + table_name = pan_process.match_load_file(csww_df, column_names) + + # Remove unwanted datasets and merge wanted with existing output + pan_data_kept = config["pan_data_kept"] + if table_name in pan_data_kept: + la_name = flip_dict(config["data_codes"])[la_code] + csww_df = pan_process.merge_agg_files(output, table_name, csww_df, la_name) + pan_process.export_pan_file(output, table_name, csww_df) + + # Run in Visual Studio Code |> -# cleanfile( -# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", -# "BAD", -# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) - -# la_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) - -# la_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) +cleanfile( + "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml", + "NEW", + "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) + +la_agg( + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) + +la_agg( + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) + +pan_agg( + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/CSWW_CSWWWorker_merged.csv", + "NEW", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) \ 
No newline at end of file
diff --git a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py
index 3d7b6f00..f6fd1f53 100644
--- a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py
+++ b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/configuration.py
@@ -2,15 +2,15 @@
 import logging
 import datetime
 import os
-import yaml
 from string import Template
+import yaml
 
-from liiatools.spec import s903 as s903_asset_dir
+from liiatools.spec import social_work_workforce as csww_asset_dir
 from liiatools.spec import common as common_asset_dir
 
 log = logging.getLogger(__name__)
 
-DEFAULT_CONFIG_DIR = Path(s903_asset_dir.__file__).parent
+DEFAULT_CONFIG_DIR = Path(csww_asset_dir.__file__).parent
 COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent
 
diff --git a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py
index 4417bc92..329fe23d 100644
--- a/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py
+++ b/liiatools/datasets/social_work_workforce/lds_csww_pan_agg/process.py
@@ -1,6 +1,6 @@
 from pathlib import Path
-import pandas as pd
 import logging
+import pandas as pd
 
 log = logging.getLogger(__name__)
 
@@ -10,44 +10,44 @@ def read_file(file):
     Reads the csv file as a pandas DataFrame
     """
     filepath = Path(file)
-    s903_df = pd.read_csv(filepath, index_col=None)
-    return s903_df
+    csww_df = pd.read_csv(filepath, index_col=None)
+    return csww_df
 
 
-def match_load_file(s903_df, column_names):
+def match_load_file(csww_df, column_names):
     """
-    Matches the columns in the DataFrame against one of the 10 SSDA903 file types
+    Matches the columns in the DataFrame against one of the 2 social work workforce file types
     """
     for table_name, expected_columns in column_names.items():
-        if set(s903_df.columns) == set(expected_columns):
+        if set(csww_df.columns) == set(expected_columns):
             return table_name
 
 
-def _merge_dfs(s903_df, old_df, la_name):
+def _merge_dfs(csww_df, old_df, la_name):
     """
     Deletes existing data for new LA from pan file
     Merges new LA data to pan file
     """
     old_df = old_df.drop(old_df[old_df["LA"] == la_name].index)
-    s903_df = pd.concat([s903_df, old_df], axis=0, ignore_index=True)
-    return s903_df
+    csww_df = pd.concat([csww_df, old_df], axis=0, ignore_index=True)
+    return csww_df
 
 
-def merge_agg_files(output, table_name, s903_df, la_name):
+def merge_agg_files(output, table_name, csww_df, la_name):
     """
     Checks if pan file exists
     Passes old and new file to function to be merged
     """
-    output_file = Path(output, f"pan_London_SSDA903_{table_name}.csv")
+    output_file = Path(output, f"pan_London_CSWW_{table_name}.csv")
     if output_file.is_file():
         old_df = pd.read_csv(output_file, index_col=None)
-        s903_df = _merge_dfs(s903_df, old_df, la_name)
-    return s903_df
+        csww_df = _merge_dfs(csww_df, old_df, la_name)
+    return csww_df
 
 
-def export_pan_file(output, table_name, s903_df):
+def export_pan_file(output, table_name, csww_df):
     """
     Writes file to output directory
     """
-    output_path = Path(output, f"pan_London_SSDA903_{table_name}.csv")
-    s903_df.to_csv(output_path, index=False)
+    output_path = Path(output, f"pan_London_CSWW_{table_name}.csv")
+    csww_df.to_csv(output_path, index=False)
diff --git a/liiatools/spec/social_work_workforce/pan-agg.yml b/liiatools/spec/social_work_workforce/pan-agg.yml
index 38b3dee2..c6bdad4e 100644
--- a/liiatools/spec/social_work_workforce/pan-agg.yml
+++ b/liiatools/spec/social_work_workforce/pan-agg.yml
@@ -1,109 +1,31 @@
 column_names:
-  Header:
-    - CHILD
-    - SEX
-    - DOB
-    - ETHNIC
-    - UPN
-    - MOTHER
-    - MC_DOB
-    - LA
-    - YEAR
-  Episodes:
-    - CHILD
-    - DECOM
-    - RNE
-    - LS
-    - CIN
-    - PLACE
-    - PLACE_PROVIDER
-    - DEC
-    - REC
-    - REASON_PLACE_CHANGE
-    - HOME_POST
-    - PL_POST
-    - URN
-    - LA
-    - YEAR
-  Reviews:
-    - CHILD
-    - DOB
-    - REVIEW
-    - REVIEW_CODE
-    - LA
-    - YEAR
-  UASC:
-    - CHILD
-    - SEX
-    - DOB
-    - DUC
-    - LA
-    - YEAR
-  OC2:
-    - CHILD
-    - DOB
-    - SDQ_SCORE
-    - SDQ_REASON
-    - CONVICTED
-    - HEALTH_CHECK
-    - IMMUNISATIONS
-    - TEETH_CHECK
-    - HEALTH_ASSESSMENT
-    - SUBSTANCE_MISUSE
-    - INTERVENTION_RECEIVED
-    - INTERVENTION_OFFERED
-    - LA
-    - YEAR
-  OC3:
-    - CHILD
-    - DOB
-    - IN_TOUCH
-    - ACTIV
-    - ACCOM
-    - LA
-    - YEAR
-  AD1:
-    - CHILD
-    - DOB
-    - DATE_INT
-    - DATE_MATCH
-    - FOSTER_CARE
-    - NB_ADOPTR
-    - SEX_ADOPTR
-    - LS_ADOPTR
-    - LA
-    - YEAR
-  PlacedAdoption:
-    - CHILD
-    - DOB
-    - DATE_PLACED
-    - DATE_PLACED_CEASED
-    - REASON_PLACED_CEASED
-    - LA
-    - YEAR
-  PrevPerm:
-    - CHILD
-    - DOB
-    - PREV_PERM
-    - LA_PERM
-    - DATE_PERM
-    - LA
-    - YEAR
-  Missing:
-    - CHILD
-    - DOB
-    - MISSING
-    - MIS_START
-    - MIS_END
-    - LA
-    - YEAR
+  CSWWWorker:
+    - AgencyWorker
+    - SWENo
+    - FTE
+    - PersonBirthDate
+    - GenderCurrent
+    - Ethnicity
+    - QualInst
+    - StepUpGrad
+    - RoleStartDate
+    - StartOrigin
+    - Cases30
+    - WorkingDaysLost
+    - ContractWeeks
+    - FrontlineGrad
+    - Absat30Sept
+    - ReasonAbsence
+    - CFKSSstatus
+    - LA
+    - YEAR
+  LALevelVacancies:
+    - NumberOfVacancies
+    - NoAgencyFTE
+    - NoAgencyHeadcount
+    - LA
+    - YEAR
 
 pan_data_kept:
-  - Header
-  - Episodes
-  - Reviews
-  - UASC
-  - OC2
-  - OC3
-  - PrevPerm
-  - Missing
\ No newline at end of file
+  - CSWWWorker
+  - LALevelVacancies
\ No newline at end of file

From 1332e12ed1f86c1127bc309b38c8a4836c33039 Mon Sep 17 00:00:00 2001
From: patrick-troy <58770937+patrick-troy@users.noreply.github.com>
Date: Wed, 5 Jul 2023 15:33:58 +0100
Subject: [PATCH 17/40] add start of cleaning functionality

---
 .../csdatatools/datasets/cincensus/filters.py | 50 +++++++++++++++++--
 .../csww_main_functions.py                    |  8 ++-
 .../lds_csww_clean/cleaner.py                 | 30 +++++++++++
 .../lds_csww_clean/schema.py                  |  9 ++++
 4 files changed, 90 insertions(+), 7 deletions(-)
 create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py

diff --git a/liiatools/csdatatools/datasets/cincensus/filters.py b/liiatools/csdatatools/datasets/cincensus/filters.py
index cd34b680..c6d7f313 100644
--- a/liiatools/csdatatools/datasets/cincensus/filters.py
+++ b/liiatools/csdatatools/datasets/cincensus/filters.py
@@ -1,12 +1,13 @@
 import logging
 from typing import List
-
+import xml.etree.ElementTree as ET
 import xmlschema
+from xmlschema import XMLSchemaValidatorError
+
 from sfdata_stream_parser.checks import type_check
 from sfdata_stream_parser import events
 from sfdata_stream_parser.collectors import collector, block_check
 from sfdata_stream_parser.filters.generic import streamfilter, pass_event
-from xmlschema import XMLSchemaValidatorError
 
 log = logging.getLogger(__name__)
 
@@ -29,7 +30,6 @@ def add_context(event, context: List[str]):
         context.pop()
     else:
         local_context = tuple(context)
-
     return event.from_event(event, context=local_context)
 
 
@@ -54,8 +54,43 @@ def strip_text(event):
         return None
 
 
+def _create_category_dict(field: str, file: str):
+    """
+    Create a dictionary containing the different categorical values of a given field to conform categories
+    e.g.
{'category': [{'code': '0', 'name': 'Not an Agency Worker'}, {'code': '1', 'name': 'Agency Worker'}]} + + :param field: Name of the categorical field you want to find the values for + :param file: Path to the .xsd schema containing possible categories + :return: Dictionary of categorical values and potential alternatives + """ + category_dict = {"category": []} + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + if element is not None: + search_value = f".//{{http://www.w3.org/2001/XMLSchema}}enumeration" + value = element.findall(search_value) + if value: + for v in value: + code_dict = {"code": v.get("value")} + category_dict["category"].append(code_dict) + + search_doc = f".//{{http://www.w3.org/2001/XMLSchema}}documentation" + documentation = element.findall(search_doc) + for i, d in enumerate(documentation): + name_dict = {"name": d.text} + category_dict["category"][i] = {**category_dict["category"][i], **name_dict} + + return category_dict + + else: + return + + @streamfilter() -def add_schema(event, schema: xmlschema.XMLSchema): +def add_schema(event, schema: xmlschema.XMLSchema, schema_path: str): """ Requires each event to have event.context as set by :func:`add_context` @@ -65,13 +100,18 @@ def add_schema(event, schema: xmlschema.XMLSchema): Provides: path, schema """ + schema_dict = None assert ( event.context ), "This filter required event.context to be set - see add_context" path = "/".join(event.context) tag = event.context[-1] el = schema.get_element(tag, path) - return event.from_event(event, path=path, schema=el) + + if el.type.name is not None and el.type.name[-4:] == "type": + schema_dict = _create_category_dict(el.type.name, schema_path) + + return event.from_event(event, path=path, schema=el, schema_dict=schema_dict) def _get_validation_error(schema, node) -> XMLSchemaValidatorError: diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 44548cd7..3abf6bad 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -11,12 +11,13 @@ # Dependencies for cleanfile() from liiatools.csdatatools.util.xml import dom_parse from liiatools.csdatatools.datasets.cincensus import filters -from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import Schema +from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import Schema, FilePath from liiatools.datasets.social_work_workforce.lds_csww_clean import ( file_creator, configuration as clean_config, csww_record, + cleaner, ) from liiatools.spec import common as common_asset_dir @@ -123,7 +124,10 @@ def cleanfile(input, la_code, la_log_dir, output): la_name = flip_dict(config["data_codes"])[la_code] stream = filters.strip_text(stream) stream = filters.add_context(stream) - stream = filters.add_schema(stream, schema=Schema(input_year).schema) + stream = filters.add_schema(stream, schema=Schema(input_year).schema, schema_path=FilePath(input_year).path) + + # Clean stream + stream = cleaner.clean_dates(stream, schema=FilePath(input_year).path) # Output results stream = csww_record.message_collector(stream) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py new file mode 100644 index 00000000..0ecc10ee --- /dev/null +++ 
b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -0,0 +1,30 @@ +import logging + +from sfdata_stream_parser.checks import type_check +from sfdata_stream_parser import events +from sfdata_stream_parser.filters.generic import streamfilter, pass_event + +from liiatools.datasets.s903.lds_ssda903_clean.converters import to_category, to_integer + +from liiatools.datasets.shared_functions.converters import to_date +from liiatools.datasets.shared_functions.common import check_postcode + +log = logging.getLogger(__name__) + + +@streamfilter( + check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event +) +def clean_dates(event): + """ + Convert all values that should be dates to dates based on the config.yaml file + + :param event: A filtered list of event objects of type Cell + :return: An updated list of event objects + """ + date = event.config_dict["date"] + try: + text = to_date(event.cell, date) + return event.from_event(event, cell=text, error="0") + except (AttributeError, TypeError, ValueError): + return event.from_event(event, cell="", error="1") diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py index f82b5eb1..cf9c6436 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py @@ -6,6 +6,15 @@ from liiatools.spec import social_work_workforce as csww_asset_dir +class FilePath: + def __init__(self, year): + self.__year = year + + @cached_property + def path(self): + return Path(csww_asset_dir.__file__).parent / f"social_work_workforce_{self.__year}.xsd" + + class Schema: def __init__(self, year): self.__year = year From 732541bbc8aeda5a2423c2969944e87bd78d70bc Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:34:47 +0000 Subject: [PATCH 18/40] Implement cli cleanfile functionality in workforce --- .../social_work_workforce/csww_cli.py | 56 +- .../csww_main_functions.py | 68 ++- .../lds_csww_clean/cleaner.py | 25 +- ...cial_work_workforce_2022_lalevel_clean.csv | 2 + ...ocial_work_workforce_2022_worker_clean.csv | 31 + .../csww/NEW/social_work_workforce_2022.xml | 556 ++++++++++++++++++ 6 files changed, 702 insertions(+), 36 deletions(-) create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv create mode 100644 liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml diff --git a/liiatools/datasets/social_work_workforce/csww_cli.py b/liiatools/datasets/social_work_workforce/csww_cli.py index 68e8600c..99f7ca33 100644 --- a/liiatools/datasets/social_work_workforce/csww_cli.py +++ b/liiatools/datasets/social_work_workforce/csww_cli.py @@ -1,12 +1,20 @@ import logging -import click as click +from pathlib import Path +import click +import yaml import click_log from liiatools.datasets.social_work_workforce import csww_main_functions +from liiatools.spec import common as common_asset_dir -logger = logging.getLogger() -click_log.basic_config(logger) +log = logging.getLogger() +click_log.basic_config(log) + +COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent +# Get all the possible LA codes that could be used +with open(f"{COMMON_CONFIG_DIR}/LA-codes.yml") as las: + la_list = 
list(yaml.full_load(las)["data_codes"].values()) @click.group() @@ -16,6 +24,46 @@ def csww(): """ pass +@csww.command() +@click.option( + "--i", + "input", + required=True, + type=str, + help="A string specifying the input file location, including the file name and suffix, usable by a pathlib Path function", +) +@click.option( + "--la_code", + required=True, + type=click.Choice(la_list, case_sensitive=False), + help="A three letter code, specifying the local authority that deposited the file", +) +@click.option( + "--la_log_dir", + required=True, + type=str, + help="A string specifying the location that the log files for the LA should be output, usable by a pathlib Path function.", +) +@click.option( + "--o", + "output", + required=True, + type=str, + help="A string specifying the output directory location", +) +@click_log.simple_verbosity_option(log) +def cleanfile(input, la_code, la_log_dir, output): + """ + Cleans input social work workforce xml files according to config and outputs cleaned csv files. + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param la_code: should be a three-letter string for the local authority depositing the file + :param la_log_dir: should specify the path to the local authority's log folder + :param output: should specify the path to the output folder + :return: None + """ + output = csww_main_functions.cleanfile(input, la_code, la_log_dir, output) + return output + @csww.command() @click.option( @@ -25,7 +73,7 @@ def csww(): type=str, help="A string specifying the output file location, including the file name and suffix", ) -@click_log.simple_verbosity_option(logger) +@click_log.simple_verbosity_option(log) def generate_sample(output: str): """ Export a sample file for testing diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index cdc19aec..04833111 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -6,12 +6,14 @@ generate_sample_csww_file, ) from liiatools.csdatatools.util.stream import consume -from liiatools.csdatatools.util.xml import etree, to_xml +from liiatools.csdatatools.util.xml import etree, to_xml, dom_parse # Dependencies for cleanfile() -from liiatools.csdatatools.util.xml import dom_parse from liiatools.csdatatools.datasets.cincensus import filters -from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import Schema, FilePath +from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import ( + Schema, + FilePath, +) from liiatools.datasets.social_work_workforce.lds_csww_clean import ( file_creator, @@ -34,14 +36,14 @@ # dependencies for la_agg() from liiatools.datasets.social_work_workforce.lds_csww_la_agg import ( configuration as agg_config, -) -from liiatools.datasets.social_work_workforce.lds_csww_la_agg import ( process as agg_process, ) # dependencies for pan_agg() -from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import configuration as pan_config -from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import process as pan_process +from liiatools.datasets.social_work_workforce.lds_csww_pan_agg import ( + configuration as pan_config, + process as pan_process, +) COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent @@ -124,10 +126,14 @@ def cleanfile(input, la_code, la_log_dir, output): la_name = 
flip_dict(config["data_codes"])[la_code] stream = filters.strip_text(stream) stream = filters.add_context(stream) - stream = filters.add_schema(stream, schema=Schema(input_year).schema, schema_path=FilePath(input_year).path) + stream = filters.add_schema( + stream, schema=Schema(input_year).schema, schema_path=FilePath(input_year).path + ) # Clean stream - stream = cleaner.clean_dates(stream, schema=FilePath(input_year).path) + #stream = cleaner.clean_dates(stream) + #stream = cleaner.clean_categories(stream) + # Output results stream = csww_record.message_collector(stream) @@ -209,25 +215,25 @@ def pan_agg(input, la_code, output): # Run in Visual Studio Code |> -cleanfile( - "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml", - "NEW", - "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) - -la_agg( - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) - -la_agg( - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) - -pan_agg( - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/CSWW_CSWWWorker_merged.csv", - "NEW", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) \ No newline at end of file +# cleanfile( +# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", +# "BAD", +# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) + +# la_agg( +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) + +# la_agg( +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) + +# pan_agg( +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/CSWW_CSWWWorker_merged.csv", +# "BAD", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 0ecc10ee..70e41079 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -7,7 +7,7 @@ from liiatools.datasets.s903.lds_ssda903_clean.converters import to_category, to_integer from liiatools.datasets.shared_functions.converters import to_date -from liiatools.datasets.shared_functions.common import check_postcode +#from liiatools.datasets.shared_functions.common import check_postcode log = logging.getLogger(__name__) @@ -22,9 +22,32 @@ def clean_dates(event): :param event: A filtered list of event objects of type Cell :return: An updated list of event objects """ + #print("running clean_dates") + 
#print(f"running clean_dates with date: {event.config_dict['date']}") date = event.config_dict["date"] try: text = to_date(event.cell, date) return event.from_event(event, cell=text, error="0") except (AttributeError, TypeError, ValueError): return event.from_event(event, cell="", error="1") + + +@streamfilter( + check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event +) +def clean_categories(event): + """ + Convert all values that should be categories to categories based on the config.yaml file + + :param event: A filtered list of event objects of type Cell + :return: An updated list of event objects + """ + category = event.config_dict["category"] + try: + text = to_category(event.cell, category) + if text != "error": + return event.from_event(event, cell=text, error="0") + else: + return event.from_event(event, cell="", error="1") + except (AttributeError, TypeError, ValueError): + return event.from_event(event, cell="", error="1") \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv new file mode 100644 index 00000000..912fc8a2 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv @@ -0,0 +1,2 @@ +NumberOfVacancies,NoAgencyFTE,NoAgencyHeadcount,YEAR,LA +79.68,59.82,71,2022,Barking and Dagenham diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv new file mode 100644 index 00000000..d00f569c --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv @@ -0,0 +1,31 @@ +AgencyWorker,SWENo,FTE,PersonBirthDate,GenderCurrent,Ethnicity,QualInst,StepUpGrad,RoleStartDate,StartOrigin,Cases30,WorkingDaysLost,ContractWeeks,FrontlineGrad,Absat30Sept,ReasonAbsence,CFKSSstatus,YEAR,LA +1,93df82943c4e0757d41565d23ecb26ecd4e146e752cf9a13f80fde8c44cfee47,0.521371,1967-01-01,1,REFU,Institution Name,0,1988-04-07,9,72,15.31,288.7,1,0,TRN,1,2022,Barking and Dagenham +1,be96a5ee5b7cc0caff4ba0452bb7d9e07d920d586912fba81d8655b373c25e04,0,1958-04-01,9,,Institution Name,1,,8,66,29.87,2.5,1,,,4,2022,Barking and Dagenham +1,a0848103de9608a0cf1daf5e0919a81b9eaf1c315bc350348e2a8124d09762fa,0,1984-01-01,,APKN,Institution Name,0,2014-01-26,4,,92.56,213.4,0,,,2,2022,Barking and Dagenham +0,aca59cf1fc90a06d57abaf1e396e74ec645df9755b78a8b9b5fdc2ac1dcc3bf2,0.899676,1990-09-01,9,BAFR,,0,2023-03-28,2,,14.39,,0,,,,2022,Barking and Dagenham +1,b5b3c4b99d2d644b05e28989a2c3c3dddfbbdb0e479ac5c7bb9d2e77333a4f53,0.133587,,2,AIND,Institution Name,1,2017-06-10,,,5.39,475.7,,1,UNP,1,2022,Barking and Dagenham +1,8e6ca61bfea2d19acf090550f277dc566ee5908fb7d0c0c45bd11abbbf04c049,0,1993-05-01,9,,,,2020-06-14,3,11,,141.0,1,,,,2022,Barking and Dagenham +1,1ce5045eb004aa120117e90a0eb2aea4aa38523be9cedb37d2621bcbcbba329c,0.803122,,,WBRI,Institution Name,,,,,95.06,403.6,0,,,2,2022,Barking and Dagenham +1,56ea36a6ef0a6e8e0bf43643363260aaa133e106d40d76c4ef8b677c0ed65f04,0.767688,1996-08-01,0,MWAS,,1,2023-03-28,,62,,,1,,,2,2022,Barking and Dagenham +0,b41b26ea7649687be2f38cec5274cacea9092418177a5270a6aa2343b04ae22e,0.843488,1959-04-01,2,APKN,Institution Name,0,1997-10-01,2,,16.74,456.3,0,,,4,2022,Barking and Dagenham 
+1,4c9a07fd1a15975784ffaa889a44c906ebc56fec34de4deeecdfafde430add1a,0,1971-10-01,,,Institution Name,1,1993-10-04,,45,22.98,441.5,0,,,3,2022,Barking and Dagenham +1,8e23e51271599fb562e2db2756034cf7b954ae51a654c020bdea17ff439c68e2,0.12232,1996-06-01,1,BAFR,,,,10,23,36.13,213.1,0,,,,2022,Barking and Dagenham +0,6ac44220390e4481616f4df2cfe2cb2471e420acc53168214b21bb4dc410a2a0,0,1987-10-01,,,,0,2012-10-02,2,37,90.85,28.5,1,1,UNA,1,2022,Barking and Dagenham +1,e974f0b0b3a6b797b3acccacaf8489bdf4b92ad5546f7595cf14fb451599ac57,0,1968-11-01,,ABAN,,0,2018-08-03,9,,43.02,154.7,1,,,1,2022,Barking and Dagenham +0,0b453303370012999ca7de11249dcb2aa0c2c94210cf4c51529426198cad8111,0,,0,AOTH,Institution Name,0,2015-04-24,9,,3.51,424.0,,,,2,2022,Barking and Dagenham +0,14d5de9740886b7a7dd72877b1d39971f66d1f7a0c364fc68c8176aa582127fa,0.859218,1968-12-01,0,BCRB,Institution Name,1,2003-09-12,5,85,98.22,206.6,,,,2,2022,Barking and Dagenham +0,fb48d144eda11163fa890ed4663e0463be3902d19837f28dadba08235d371496,0.021911,1977-06-01,,REFU,Institution Name,1,2022-10-08,6,25,29.19,,1,1,SIC,,2022,Barking and Dagenham +1,25f8e1c67c130e33a250ec3216cfe3630c15ffe7fc12066b94d5c90244878c1a,0.786453,1996-11-01,0,MWAS,Institution Name,0,2023-03-28,1,83,78.29,364.4,1,,,,2022,Barking and Dagenham +0,f4b772fd07a4b01a9a3f88808ed073fe5b299f2cf4be244883154614dbd8ae5c,0.491425,1998-04-01,1,,,0,2023-03-28,4,,3.1,415.3,,,,4,2022,Barking and Dagenham +1,c569dae7a53fce2755b29a979664734b18275c28b7f5817f71b65f5776e04e64,0.192894,1959-09-01,0,REFU,Institution Name,1,1985-12-12,9,14,,188.4,0,,,1,2022,Barking and Dagenham +0,9e56119a6e4f2760d518f5b9537d81f6de943c1144cce67e1f14f6ad44e81dd2,0,1962-11-01,,WBRI,,1,1999-07-14,,65,16.26,,1,,,,2022,Barking and Dagenham +1,5a075ae2486f88a2b944784d643f2ad5f06e2263bbe65c6c4db8a07b20f7d1c7,0.862474,1992-02-01,0,WBRI,,1,2023-03-28,4,39,,,0,,,2,2022,Barking and Dagenham +1,256eb6f0eda1d17c6cf4c9cd5b7ff965e302218b2dc6ffe681269e52faab0e29,0,2001-10-01,1,AIND,,0,,1,29,38.71,339.9,0,,,,2022,Barking and Dagenham +1,79f9865feea17900c27cf27e98ba15de977887a4844241ed019874259ad7ef6a,0.668266,1983-04-01,9,,Institution Name,1,2023-03-28,3,,,,1,,,2,2022,Barking and Dagenham +0,34ec568120a609fa591dcf9c7b5331640f01ef262f379a8816a2876bceb0b191,0.978729,1958-04-01,9,MWBA,Institution Name,,2002-01-31,1,,,121.9,1,,,3,2022,Barking and Dagenham +1,d258de55dec2e6165c8fb23ceeaf4777bfde025c4c9f0e4d5a1eca26d481a1e2,0,1989-06-01,1,,,1,2011-08-31,5,70,12.2,301.3,0,,,4,2022,Barking and Dagenham +0,d441374724414d204880c95117ef87c169a6aba7eaddd1ed4a613244a87b75e4,0,1981-09-01,,CHNE,,1,2001-02-10,6,,94.67,471.5,0,,,2,2022,Barking and Dagenham +1,206e97c471f4d5221f8db5aa404d02939e28e13591da37cba903e9f2e1ecc3cd,0.561974,,,OOTH,Institution Name,0,2014-09-30,4,63,87.59,,,0,SIC,1,2022,Barking and Dagenham +0,c5c93de4d0858abb1382d1af403cf9d2e1e87e650fce313b36e0e84bbac0f8b4,0,,,,Institution Name,0,1993-04-18,3,80,,299.1,0,,,4,2022,Barking and Dagenham +0,6e4c8b306eae10888de2dbc28c334f74201a1e7f41da4cac9f71e65dff211ef8,0.034436,,2,APKN,Institution Name,,1997-01-11,4,23,83.01,,0,,,3,2022,Barking and Dagenham +1,170646f335a0f93ad112764dcd11fe7a4890f08aefb28d1696807e37e882153d,0,1975-01-01,9,WOTH,Institution Name,0,2016-08-20,9,87,13.01,,1,,,,2022,Barking and Dagenham diff --git a/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml b/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml new file mode 100644 index 00000000..3cb567fa --- /dev/null +++ 
b/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml
@@ -0,0 +1,556 @@
[556 added lines of generated sample XML omitted: the element tags were lost in extraction, leaving only bare values. The file holds a collection header (values CSWW, 2022, reference date 2022-09-30, source level L, LA number 314, produced by liiatools.datasets.social_work_workforce.sample_data at 2023-03-28T14:54:55Z), one LALevelVacancies block (99.68, 75.82, 142) and thirty CSWWWorker records.]
\ No newline at end of file From f77f196207e8331c2d0202c0c5c1e0e15a292214 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:37:10 +0000 Subject: [PATCH 19/40] Remove csv files --- ...cial_work_workforce_2022_lalevel_clean.csv | 2 -- ...ocial_work_workforce_2022_worker_clean.csv | 31 ------------------- 2 files changed, 33 deletions(-) delete mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv delete mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv deleted file mode 100644 index 912fc8a2..00000000 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv +++ /dev/null @@ -1,2 +0,0 @@ -NumberOfVacancies,NoAgencyFTE,NoAgencyHeadcount,YEAR,LA -79.68,59.82,71,2022,Barking and Dagenham diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv b/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv deleted file mode 100644 index d00f569c..00000000 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv +++ /dev/null @@ -1,31 +0,0 @@ -AgencyWorker,SWENo,FTE,PersonBirthDate,GenderCurrent,Ethnicity,QualInst,StepUpGrad,RoleStartDate,StartOrigin,Cases30,WorkingDaysLost,ContractWeeks,FrontlineGrad,Absat30Sept,ReasonAbsence,CFKSSstatus,YEAR,LA -1,93df82943c4e0757d41565d23ecb26ecd4e146e752cf9a13f80fde8c44cfee47,0.521371,1967-01-01,1,REFU,Institution Name,0,1988-04-07,9,72,15.31,288.7,1,0,TRN,1,2022,Barking and Dagenham -1,be96a5ee5b7cc0caff4ba0452bb7d9e07d920d586912fba81d8655b373c25e04,0,1958-04-01,9,,Institution Name,1,,8,66,29.87,2.5,1,,,4,2022,Barking and Dagenham -1,a0848103de9608a0cf1daf5e0919a81b9eaf1c315bc350348e2a8124d09762fa,0,1984-01-01,,APKN,Institution Name,0,2014-01-26,4,,92.56,213.4,0,,,2,2022,Barking and Dagenham -0,aca59cf1fc90a06d57abaf1e396e74ec645df9755b78a8b9b5fdc2ac1dcc3bf2,0.899676,1990-09-01,9,BAFR,,0,2023-03-28,2,,14.39,,0,,,,2022,Barking and Dagenham -1,b5b3c4b99d2d644b05e28989a2c3c3dddfbbdb0e479ac5c7bb9d2e77333a4f53,0.133587,,2,AIND,Institution Name,1,2017-06-10,,,5.39,475.7,,1,UNP,1,2022,Barking and Dagenham -1,8e6ca61bfea2d19acf090550f277dc566ee5908fb7d0c0c45bd11abbbf04c049,0,1993-05-01,9,,,,2020-06-14,3,11,,141.0,1,,,,2022,Barking and Dagenham -1,1ce5045eb004aa120117e90a0eb2aea4aa38523be9cedb37d2621bcbcbba329c,0.803122,,,WBRI,Institution Name,,,,,95.06,403.6,0,,,2,2022,Barking and Dagenham -1,56ea36a6ef0a6e8e0bf43643363260aaa133e106d40d76c4ef8b677c0ed65f04,0.767688,1996-08-01,0,MWAS,,1,2023-03-28,,62,,,1,,,2,2022,Barking and Dagenham -0,b41b26ea7649687be2f38cec5274cacea9092418177a5270a6aa2343b04ae22e,0.843488,1959-04-01,2,APKN,Institution Name,0,1997-10-01,2,,16.74,456.3,0,,,4,2022,Barking and Dagenham -1,4c9a07fd1a15975784ffaa889a44c906ebc56fec34de4deeecdfafde430add1a,0,1971-10-01,,,Institution Name,1,1993-10-04,,45,22.98,441.5,0,,,3,2022,Barking and Dagenham -1,8e23e51271599fb562e2db2756034cf7b954ae51a654c020bdea17ff439c68e2,0.12232,1996-06-01,1,BAFR,,,,10,23,36.13,213.1,0,,,,2022,Barking and Dagenham 
-0,6ac44220390e4481616f4df2cfe2cb2471e420acc53168214b21bb4dc410a2a0,0,1987-10-01,,,,0,2012-10-02,2,37,90.85,28.5,1,1,UNA,1,2022,Barking and Dagenham -1,e974f0b0b3a6b797b3acccacaf8489bdf4b92ad5546f7595cf14fb451599ac57,0,1968-11-01,,ABAN,,0,2018-08-03,9,,43.02,154.7,1,,,1,2022,Barking and Dagenham -0,0b453303370012999ca7de11249dcb2aa0c2c94210cf4c51529426198cad8111,0,,0,AOTH,Institution Name,0,2015-04-24,9,,3.51,424.0,,,,2,2022,Barking and Dagenham -0,14d5de9740886b7a7dd72877b1d39971f66d1f7a0c364fc68c8176aa582127fa,0.859218,1968-12-01,0,BCRB,Institution Name,1,2003-09-12,5,85,98.22,206.6,,,,2,2022,Barking and Dagenham -0,fb48d144eda11163fa890ed4663e0463be3902d19837f28dadba08235d371496,0.021911,1977-06-01,,REFU,Institution Name,1,2022-10-08,6,25,29.19,,1,1,SIC,,2022,Barking and Dagenham -1,25f8e1c67c130e33a250ec3216cfe3630c15ffe7fc12066b94d5c90244878c1a,0.786453,1996-11-01,0,MWAS,Institution Name,0,2023-03-28,1,83,78.29,364.4,1,,,,2022,Barking and Dagenham -0,f4b772fd07a4b01a9a3f88808ed073fe5b299f2cf4be244883154614dbd8ae5c,0.491425,1998-04-01,1,,,0,2023-03-28,4,,3.1,415.3,,,,4,2022,Barking and Dagenham -1,c569dae7a53fce2755b29a979664734b18275c28b7f5817f71b65f5776e04e64,0.192894,1959-09-01,0,REFU,Institution Name,1,1985-12-12,9,14,,188.4,0,,,1,2022,Barking and Dagenham -0,9e56119a6e4f2760d518f5b9537d81f6de943c1144cce67e1f14f6ad44e81dd2,0,1962-11-01,,WBRI,,1,1999-07-14,,65,16.26,,1,,,,2022,Barking and Dagenham -1,5a075ae2486f88a2b944784d643f2ad5f06e2263bbe65c6c4db8a07b20f7d1c7,0.862474,1992-02-01,0,WBRI,,1,2023-03-28,4,39,,,0,,,2,2022,Barking and Dagenham -1,256eb6f0eda1d17c6cf4c9cd5b7ff965e302218b2dc6ffe681269e52faab0e29,0,2001-10-01,1,AIND,,0,,1,29,38.71,339.9,0,,,,2022,Barking and Dagenham -1,79f9865feea17900c27cf27e98ba15de977887a4844241ed019874259ad7ef6a,0.668266,1983-04-01,9,,Institution Name,1,2023-03-28,3,,,,1,,,2,2022,Barking and Dagenham -0,34ec568120a609fa591dcf9c7b5331640f01ef262f379a8816a2876bceb0b191,0.978729,1958-04-01,9,MWBA,Institution Name,,2002-01-31,1,,,121.9,1,,,3,2022,Barking and Dagenham -1,d258de55dec2e6165c8fb23ceeaf4777bfde025c4c9f0e4d5a1eca26d481a1e2,0,1989-06-01,1,,,1,2011-08-31,5,70,12.2,301.3,0,,,4,2022,Barking and Dagenham -0,d441374724414d204880c95117ef87c169a6aba7eaddd1ed4a613244a87b75e4,0,1981-09-01,,CHNE,,1,2001-02-10,6,,94.67,471.5,0,,,2,2022,Barking and Dagenham -1,206e97c471f4d5221f8db5aa404d02939e28e13591da37cba903e9f2e1ecc3cd,0.561974,,,OOTH,Institution Name,0,2014-09-30,4,63,87.59,,,0,SIC,1,2022,Barking and Dagenham -0,c5c93de4d0858abb1382d1af403cf9d2e1e87e650fce313b36e0e84bbac0f8b4,0,,,,Institution Name,0,1993-04-18,3,80,,299.1,0,,,4,2022,Barking and Dagenham -0,6e4c8b306eae10888de2dbc28c334f74201a1e7f41da4cac9f71e65dff211ef8,0.034436,,2,APKN,Institution Name,,1997-01-11,4,23,83.01,,0,,,3,2022,Barking and Dagenham -1,170646f335a0f93ad112764dcd11fe7a4890f08aefb28d1696807e37e882153d,0,1975-01-01,9,WOTH,Institution Name,0,2016-08-20,9,87,13.01,,1,,,,2022,Barking and Dagenham From 034f7a2d1cdbd4708e9d9b192e75f307f5b5d0fb Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 7 Jul 2023 07:50:52 +0000 Subject: [PATCH 20/40] Implement workforce cli; start adding cleansing --- .../csww_main_functions.py | 16 +++++----- .../lds_csww_clean/cleaner.py | 31 +++++++++---------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 04833111..9dc8b5fb 100644 --- 
a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -131,8 +131,8 @@ def cleanfile(input, la_code, la_log_dir, output): ) # Clean stream - #stream = cleaner.clean_dates(stream) - #stream = cleaner.clean_categories(stream) + stream = cleaner.clean_categories(stream) + stream = cleaner.clean_dates(stream) # Output results @@ -215,12 +215,12 @@ def pan_agg(input, la_code, output): # Run in Visual Studio Code |> -# cleanfile( -# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", -# "BAD", -# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) +cleanfile( + "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", + "BAD", + "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 70e41079..d0596cd7 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -17,19 +17,17 @@ ) def clean_dates(event): """ - Convert all values that should be dates to dates based on the config.yaml file + Convert all values that should be dates to dates based on the schema xsd file - :param event: A filtered list of event objects of type Cell + :param event: A filtered list of event objects of type text :return: An updated list of event objects """ - #print("running clean_dates") - #print(f"running clean_dates with date: {event.config_dict['date']}") - date = event.config_dict["date"] + date = event.schema_dict["date"] try: - text = to_date(event.cell, date) - return event.from_event(event, cell=text, error="0") + newtext = to_date(event.text, date) + return event.from_event(event, text=f"xDATEx{newtext}", error="0") except (AttributeError, TypeError, ValueError): - return event.from_event(event, cell="", error="1") + return event.from_event(event, text="", error="1") @streamfilter( @@ -37,17 +35,16 @@ def clean_dates(event): ) def clean_categories(event): """ - Convert all values that should be categories to categories based on the config.yaml file + Convert all values that should be categories to categories based on the schema xsd file - :param event: A filtered list of event objects of type Cell + :param event: A filtered list of event objects of type text :return: An updated list of event objects """ - category = event.config_dict["category"] + category = event.schema_dict["category"] try: - text = to_category(event.cell, category) - if text != "error": - return event.from_event(event, cell=text, error="0") - else: - return event.from_event(event, cell="", error="1") + newtext = to_category(event.text, category) + if newtext != "error": + return event.from_event(event, text=f"xCATx{newtext}", error='0') + return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): - return event.from_event(event, cell="", error="1") \ No newline at end of file + return 
event.from_event(event, text="", error="1") \ No newline at end of file From 5509c60b7bba40990180b9082f05b6aa87d37779 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:09:42 +0000 Subject: [PATCH 21/40] Add missing fields to la-agg.yml --- .../social_work_workforce/csww_main_functions.py | 12 ++++++------ .../social_work_workforce/lds_csww_clean/cleaner.py | 4 ++-- liiatools/spec/social_work_workforce/la-agg.yml | 3 +++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 9dc8b5fb..86ac784c 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -215,12 +215,12 @@ def pan_agg(input, la_code, output): # Run in Visual Studio Code |> -cleanfile( - "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", - "BAD", - "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) +# cleanfile( +# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", +# "BAD", +# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index d0596cd7..b85b5da1 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -25,7 +25,7 @@ def clean_dates(event): date = event.schema_dict["date"] try: newtext = to_date(event.text, date) - return event.from_event(event, text=f"xDATEx{newtext}", error="0") + return event.from_event(event, text=newtext, error="0") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -44,7 +44,7 @@ def clean_categories(event): try: newtext = to_category(event.text, category) if newtext != "error": - return event.from_event(event, text=f"xCATx{newtext}", error='0') + return event.from_event(event, text=newtext, error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") \ No newline at end of file diff --git a/liiatools/spec/social_work_workforce/la-agg.yml b/liiatools/spec/social_work_workforce/la-agg.yml index d3b909ff..3d64fe54 100644 --- a/liiatools/spec/social_work_workforce/la-agg.yml +++ b/liiatools/spec/social_work_workforce/la-agg.yml @@ -39,6 +39,9 @@ sort_order: dedup: CSWWWorker: + - AgencyWorker + - SWENo + - FTE - PersonBirthDate - GenderCurrent - Ethnicity From 0a54643d3c2d595062f2ac8d7deff577bc42154c Mon Sep 17 00:00:00 2001 From: patrick-troy <58770937+patrick-troy@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:43:46 +0100 Subject: [PATCH 22/40] add remaining schema_dicts --- .../csdatatools/datasets/cincensus/filters.py | 90 +++++++++++++++++-- .../csww_main_functions.py | 14 ++- 2 files changed, 95 insertions(+), 9 
deletions(-) diff --git a/liiatools/csdatatools/datasets/cincensus/filters.py b/liiatools/csdatatools/datasets/cincensus/filters.py index c6d7f313..87bedf33 100644 --- a/liiatools/csdatatools/datasets/cincensus/filters.py +++ b/liiatools/csdatatools/datasets/cincensus/filters.py @@ -89,8 +89,64 @@ def _create_category_dict(field: str, file: str): return +def _create_float_dict(field: str, file: str): + float_dict = None + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" + restriction = element.findall(search_restriction) + for r in restriction: + code_dict = {"numeric": r.get("base")[3:]} # Remove the "xs:" from the start of the base string + if code_dict["numeric"] == "decimal": + float_dict = code_dict + + search_fraction_digits = f".//{{http://www.w3.org/2001/XMLSchema}}fractionDigits" + fraction_digits = element.findall(search_fraction_digits) + for f in fraction_digits: + fraction_digits_dict = {"fixed": f.get("fixed"), "decimal": f.get("value")} + float_dict = {**float_dict, **fraction_digits_dict} + + search_min_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}minInclusive" + min_inclusive = element.findall(search_min_inclusive) + for m in min_inclusive: + min_dict = {"min_inclusive": m.get("value")} + float_dict = {**float_dict, **min_dict} + + search_max_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}maxInclusive" + max_inclusive = element.findall(search_max_inclusive) + for m in max_inclusive: + max_dict = {"max_inclusive": m.get("value")} + float_dict = {**float_dict, **max_dict} + + return float_dict + + +def _create_regex_dict(field: str, file: str): + regex_dict = None + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" + restriction = element.findall(search_restriction) + for r in restriction: + if r.get("base") == "xs:string": + regex_dict = {"regex_string": None} + + search_pattern = f".//{{http://www.w3.org/2001/XMLSchema}}pattern" + pattern = element.findall(search_pattern) + for p in pattern: + regex_dict["regex_string"] = p.get("value") + + return regex_dict + + @streamfilter() -def add_schema(event, schema: xmlschema.XMLSchema, schema_path: str): +def add_schema(event, schema: xmlschema.XMLSchema): """ Requires each event to have event.context as set by :func:`add_context` @@ -100,7 +156,6 @@ def add_schema(event, schema: xmlschema.XMLSchema, schema_path: str): Provides: path, schema """ - schema_dict = None assert ( event.context ), "This filter required event.context to be set - see add_context" @@ -108,10 +163,35 @@ def add_schema(event, schema: xmlschema.XMLSchema, schema_path: str): tag = event.context[-1] el = schema.get_element(tag, path) - if el.type.name is not None and el.type.name[-4:] == "type": - schema_dict = _create_category_dict(el.type.name, schema_path) + return event.from_event(event, path=path, schema=el) + + +@streamfilter(check=type_check(events.TextNode), fail_function=pass_event) +def add_schema_dict(event, schema_path: str): + schema_dict = None - return event.from_event(event, path=path, schema=el, schema_dict=schema_dict) + config_type = event.schema.type.name + if config_type is not None: + if config_type[-4:] == "type": + schema_dict = _create_category_dict(config_type, 
schema_path) + if config_type in ["onedecimalplace", "twodecimalplaces", "ftetype"]: + schema_dict = _create_float_dict(config_type, schema_path) + if config_type in ["swetype"]: + schema_dict = _create_regex_dict(config_type, schema_path) + if config_type == "{http://www.w3.org/2001/XMLSchema}date": + schema_dict = {"date": "%d/%m/%Y"} + if config_type == "{http://www.w3.org/2001/XMLSchema}integer": + schema_dict = {"numeric": "integer"} + if config_type == "{http://www.w3.org/2001/XMLSchema}string": + schema_dict = {"string": "alphanumeric"} + + if schema_dict is not None: + if event.schema.occurs[0] == 0: + schema_dict = {**schema_dict, **{"canbeblank": "yes"}} + elif event.schema.occurs[0] == 1: + schema_dict = {**schema_dict, **{"canbeblank": "no"}} + + return event.from_event(event, schema_dict=schema_dict) def _get_validation_error(schema, node) -> XMLSchemaValidatorError: diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 86ac784c..4f2e3e93 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -9,6 +9,7 @@ from liiatools.csdatatools.util.xml import etree, to_xml, dom_parse # Dependencies for cleanfile() +from liiatools.csdatatools.util.xml import dom_parse from liiatools.csdatatools.datasets.cincensus import filters from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import ( Schema, @@ -126,15 +127,13 @@ def cleanfile(input, la_code, la_log_dir, output): la_name = flip_dict(config["data_codes"])[la_code] stream = filters.strip_text(stream) stream = filters.add_context(stream) - stream = filters.add_schema( - stream, schema=Schema(input_year).schema, schema_path=FilePath(input_year).path - ) + stream = filters.add_schema(stream, schema=Schema(input_year).schema) + stream = filters.add_schema_dict(stream, schema_path=FilePath(input_year).path) # Clean stream stream = cleaner.clean_categories(stream) stream = cleaner.clean_dates(stream) - # Output results stream = csww_record.message_collector(stream) @@ -227,6 +226,13 @@ def pan_agg(input, la_code, output): # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) +cleanfile( + r"C:\Users\patrick.troy\Downloads\LIIA tests\social_work_workforce_2022.xml", + "NEW", + r"C:\Users\patrick.troy\Downloads\LIIA tests", + r"C:\Users\patrick.troy\Downloads\LIIA tests", +) + # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", From 0ad5deb58629897faf5d4af2e018839cecdbf55d Mon Sep 17 00:00:00 2001 From: patrick-troy <58770937+patrick-troy@users.noreply.github.com> Date: Fri, 7 Jul 2023 15:50:59 +0100 Subject: [PATCH 23/40] add logger functionality --- .../csww_main_functions.py | 19 +- .../lds_csww_clean/filters.py | 262 ++++++++++++++++++ .../lds_csww_clean/logger.py | 225 +++++++++++++++ .../lds_csww_clean/xml.py | 49 ++++ 4 files changed, 544 insertions(+), 11 deletions(-) create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py 
b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 4f2e3e93..5f4a0e01 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -6,11 +6,13 @@ generate_sample_csww_file, ) from liiatools.csdatatools.util.stream import consume -from liiatools.csdatatools.util.xml import etree, to_xml, dom_parse # Dependencies for cleanfile() -from liiatools.csdatatools.util.xml import dom_parse -from liiatools.csdatatools.datasets.cincensus import filters +from liiatools.datasets.social_work_workforce.lds_csww_clean.xml import ( + etree, + to_xml, + dom_parse +) from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import ( Schema, FilePath, @@ -21,6 +23,8 @@ configuration as clean_config, csww_record, cleaner, + logger, + filters, ) from liiatools.spec import common as common_asset_dir @@ -102,7 +106,6 @@ def cleanfile(input, la_code, la_log_dir, output): ): return stream = dom_parse(input) - stream = list(stream) # Get year from input file try: @@ -133,6 +136,7 @@ def cleanfile(input, la_code, la_log_dir, output): # Clean stream stream = cleaner.clean_categories(stream) stream = cleaner.clean_dates(stream) + stream = logger.log_errors(stream) # Output results stream = csww_record.message_collector(stream) @@ -226,13 +230,6 @@ def pan_agg(input, la_code, output): # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) -cleanfile( - r"C:\Users\patrick.troy\Downloads\LIIA tests\social_work_workforce_2022.xml", - "NEW", - r"C:\Users\patrick.troy\Downloads\LIIA tests", - r"C:\Users\patrick.troy\Downloads\LIIA tests", -) - # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py new file mode 100644 index 00000000..06fb8798 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py @@ -0,0 +1,262 @@ +import logging +from typing import List +import xml.etree.ElementTree as ET +import xmlschema +from xmlschema import XMLSchemaValidatorError + +from sfdata_stream_parser.checks import type_check +from sfdata_stream_parser import events +from sfdata_stream_parser.collectors import collector, block_check +from sfdata_stream_parser.filters.generic import streamfilter, pass_event + +log = logging.getLogger(__name__) + + +@streamfilter(default_args=lambda: {"context": []}) +def add_context(event, context: List[str]): + """ + Adds 'context' to XML structures. For each :class:`sfdata_stream_parser.events.StartElement` the tag name is + added to a 'context' tuple, and for each :class:`sfdata_stream_parser.events.EndElement` the context is popped. + + For all other events, the context tuple is set as-is. + + Provides: context + """ + if isinstance(event, events.StartElement): + context.append(event.tag) + local_context = tuple(context) + elif isinstance(event, events.EndElement): + local_context = tuple(context) + context.pop() + else: + local_context = tuple(context) + return event.from_event(event, context=local_context) + + +@streamfilter(check=type_check(events.TextNode), fail_function=pass_event) +def strip_text(event): + """ + Strips surrounding whitespaces from :class:`sfdata_stream_parser.events.TextNode`. 
If the event does + not have a text property then this filter fails silently. + """ + if not hasattr(event, "text"): + return event + + if event.text is None: + return event + + text = event.text.strip() + return event.from_event(event, text=text) + + +def _create_category_dict(field: str, file: str): + """ + Create a dictionary containing the different categorical values of a given field to conform categories + e.g. {'category': [{'code': '0', 'name': 'Not an Agency Worker'}, {'code': '1', 'name': 'Agency Worker'}]} + + :param field: Name of the categorical field you want to find the values for + :param file: Path to the .xsd schema containing possible categories + :return: Dictionary of categorical values and potential alternatives + """ + category_dict = {"category": []} + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + if element is not None: + search_value = f".//{{http://www.w3.org/2001/XMLSchema}}enumeration" + value = element.findall(search_value) + if value: + for v in value: + code_dict = {"code": v.get("value")} + category_dict["category"].append(code_dict) + + search_doc = f".//{{http://www.w3.org/2001/XMLSchema}}documentation" + documentation = element.findall(search_doc) + for i, d in enumerate(documentation): + name_dict = {"name": d.text} + category_dict["category"][i] = {**category_dict["category"][i], **name_dict} + + return category_dict + + else: + return + + +def _create_float_dict(field: str, file: str): + float_dict = None + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" + restriction = element.findall(search_restriction) + for r in restriction: + code_dict = {"numeric": r.get("base")[3:]} # Remove the "xs:" from the start of the base string + if code_dict["numeric"] == "decimal": + float_dict = code_dict + + search_fraction_digits = f".//{{http://www.w3.org/2001/XMLSchema}}fractionDigits" + fraction_digits = element.findall(search_fraction_digits) + for f in fraction_digits: + fraction_digits_dict = {"fixed": f.get("fixed"), "decimal": f.get("value")} + float_dict = {**float_dict, **fraction_digits_dict} + + search_min_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}minInclusive" + min_inclusive = element.findall(search_min_inclusive) + for m in min_inclusive: + min_dict = {"min_inclusive": m.get("value")} + float_dict = {**float_dict, **min_dict} + + search_max_inclusive = f".//{{http://www.w3.org/2001/XMLSchema}}maxInclusive" + max_inclusive = element.findall(search_max_inclusive) + for m in max_inclusive: + max_dict = {"max_inclusive": m.get("value")} + float_dict = {**float_dict, **max_dict} + + return float_dict + + +def _create_regex_dict(field: str, file: str): + regex_dict = None + + xsd_xml = ET.parse(file) + search_elem = f".//{{http://www.w3.org/2001/XMLSchema}}simpleType[@name='{field}']" + element = xsd_xml.find(search_elem) + + search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" + restriction = element.findall(search_restriction) + for r in restriction: + if r.get("base") == "xs:string": + regex_dict = {"regex_string": None} + + search_pattern = f".//{{http://www.w3.org/2001/XMLSchema}}pattern" + pattern = element.findall(search_pattern) + for p in pattern: + regex_dict["regex_string"] = p.get("value") + + return regex_dict + + 
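Taken together, the three _create_*_dict helpers above condense an xs:simpleType restriction into the small schema_dict the cleaners consume downstream. A minimal self-check sketch follows, under assumed schema shapes rather than the verbatim workforce .xsd: the type names twodecimalplaces and swetype are the ones add_schema_dict routes on, the SWE pattern is the one the unit tests later in this series exercise, and the decimal bounds are invented for illustration.

    # Self-check for the helpers above (assumed schema shapes, not the real .xsd):
    if __name__ == "__main__":
        import tempfile

        _SAMPLE_XSD = r"""<?xml version="1.0"?>
    <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
      <xs:simpleType name="twodecimalplaces">
        <xs:restriction base="xs:decimal">
          <xs:fractionDigits value="2" fixed="true"/>
          <xs:minInclusive value="0"/>
          <xs:maxInclusive value="500"/>
        </xs:restriction>
      </xs:simpleType>
      <xs:simpleType name="swetype">
        <xs:restriction base="xs:string">
          <xs:pattern value="[A-Za-z]{2}\d{10}"/>
        </xs:restriction>
      </xs:simpleType>
    </xs:schema>"""

        # Write the sample schema to a temporary file so the helpers can parse it
        with tempfile.NamedTemporaryFile("w", suffix=".xsd", delete=False) as f:
            f.write(_SAMPLE_XSD)

        # The "xs:" prefix is stripped from the restriction base, leaving the
        # numeric kind plus the fraction-digit and inclusive-bound facets:
        print(_create_float_dict("twodecimalplaces", f.name))
        # {'numeric': 'decimal', 'fixed': 'true', 'decimal': '2',
        #  'min_inclusive': '0', 'max_inclusive': '500'}

        # A string restriction with a pattern facet yields just the raw regex:
        print(_create_regex_dict("swetype", f.name))
        # {'regex_string': '[A-Za-z]{2}\\d{10}'}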
+@streamfilter() +def add_schema(event, schema: xmlschema.XMLSchema): + """ + Requires each event to have event.context as set by :func:`add_context` + + Based on the context (a tuple of element tags) it will set path which is the + derived path (based on the context tags) joined by '/' and schema holding the + corresponding schema element, if found. + + Provides: path, schema + """ + assert ( + event.context + ), "This filter required event.context to be set - see add_context" + path = "/".join(event.context) + tag = event.context[-1] + el = schema.get_element(tag, path) + + return event.from_event(event, path=path, schema=el) + + +@streamfilter(check=type_check(events.TextNode), fail_function=pass_event) +def add_schema_dict(event, schema_path: str): + schema_dict = None + + config_type = event.schema.type.name + if config_type is not None: + if config_type[-4:] == "type": + schema_dict = _create_category_dict(config_type, schema_path) + if config_type in ["onedecimalplace", "twodecimalplaces", "ftetype"]: + schema_dict = _create_float_dict(config_type, schema_path) + if config_type in ["swetype"]: + schema_dict = _create_regex_dict(config_type, schema_path) + if config_type == "{http://www.w3.org/2001/XMLSchema}date": + schema_dict = {"date": "%Y/%m/%d"} + if config_type == "{http://www.w3.org/2001/XMLSchema}integer": + schema_dict = {"numeric": "integer"} + if config_type == "{http://www.w3.org/2001/XMLSchema}string": + schema_dict = {"string": "alphanumeric"} + + if schema_dict is not None: + if event.schema.occurs[0] == 0: + schema_dict = {**schema_dict, **{"canbeblank": True}} + elif event.schema.occurs[0] == 1: + schema_dict = {**schema_dict, **{"canbeblank": False}} + + return event.from_event(event, schema_dict=schema_dict) + + +def _get_validation_error(schema, node) -> XMLSchemaValidatorError: + try: + schema.validate(node) + return None + except XMLSchemaValidatorError as e: + return e + + +@streamfilter(check=type_check(events.StartElement), fail_function=pass_event) +def validate_elements(event): + """ + Validates each element, and if not valid, sets the properties: + + * valid - (always False) + * validation_message - a descriptive validation message + """ + validation_error = _get_validation_error(event.schema, event.node) + if validation_error is None: + return event + + message = ( + validation_error.reason + if hasattr(validation_error, "reason") + else validation_error.message + ) + return events.StartElement.from_event( + event, valid=False, validation_message=message + ) + + +@streamfilter(check=type_check(events.StartElement), fail_function=pass_event) +def prop_to_attribute(event, prop_name): + """ + Elevates an event property to an XML attribute. 
+ """ + if hasattr(event, prop_name): + attrs = getattr(event, "attrs", {}) + attrs[prop_name] = getattr(event, prop_name) + return events.StartElement.from_event(event, attrs=attrs) + else: + return event + + +@collector(check=block_check(events.StartElement), receive_stream=True) +def remove_invalid(stream, tag_name): + """ + Filters out events with the given tag name if they are not valid + """ + stream = list(stream) + first = stream[0] + last = stream[-1] + stream = stream[1:-1] + + if first.tag == tag_name and not getattr(first, "valid", True): + yield from [] + else: + yield first + + if len(stream) > 0: + yield from remove_invalid(stream, tag_name=tag_name) + + yield last + + +@streamfilter(check=lambda x: True) +def counter(event, counter_check, context): + if counter_check(event): + context["pass"] += 1 + else: + context["fail"] += 1 + return event diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py new file mode 100644 index 00000000..1fc9cf88 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py @@ -0,0 +1,225 @@ +from collections import Counter +from datetime import datetime +import logging +import os + +from sfdata_stream_parser import events +from sfdata_stream_parser.filters.generic import streamfilter, pass_event +from sfdata_stream_parser.checks import type_check + +log = logging.getLogger(__name__) + + +class ErrorTable(events.ParseEvent): + pass + + +def create_formatting_error_count(stream): + """ + Create a list of the column headers for cells with formatting errors (event.error = 1) for each table + + :param stream: A filtered list of event objects + :return: An updated list of event objects with error counts + """ + formatting_error_count = None + for event in stream: + if isinstance(event, events.StartElement) and event.tag == "LALevelVacancies": + formatting_error_count = [] + elif isinstance(event, events.EndElement) and event.tag == "Message": + yield ErrorTable.from_event( + event, + formatting_error_count=formatting_error_count, + ) + formatting_error_count = None + elif ( + formatting_error_count is not None + and isinstance(event, events.TextNode) + ): + try: + if event.error == "1": + formatting_error_count.append(event.schema.name) + except AttributeError: # Raised in case there is no event.error + pass + yield event + + +@streamfilter( + check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event +) +def blank_error_check(event): + """ + Check all the values against the config to see if they are allowed to be blank + if they are blank but should not be, record this as event.blank_error = 1 + + :param event: A filtered list of event objects of type Cell + :return: An updated list of event objects + """ + try: + allowed_blank = event.schema_dict["canbeblank"] + error = getattr(event, "error", "0") + if not allowed_blank and not event.text and error != "1": + return event.from_event(event, blank_error="1") + else: + return event + except AttributeError: # Raised in case there is no config item for the given cell + pass + + +def create_blank_error_count(stream): + """ + Create a list of the column headers for cells with blank fields that should not be blank (event.blank_error = 1) + for each table + + :param stream: A filtered list of event objects + :return: An updated list of event objects + """ + blank_error_count = None + for event in stream: + if isinstance(event, events.StartTable): + blank_error_count 
= [] + elif isinstance(event, events.EndTable): + blank_error_count = None + elif isinstance(event, ErrorTable): + yield ErrorTable.from_event(event, blank_error_count=blank_error_count) + blank_error_count = None + elif blank_error_count is not None and isinstance(event, events.Cell): + try: + if event.blank_error == "1": + blank_error_count.append(event.header) + except AttributeError: + pass + yield event + + +@streamfilter( + check=type_check(events.StartTable), + fail_function=pass_event, + error_function=pass_event, +) +def create_file_match_error(event): + """ + Add a match_error to StartTables that do not have an event.sheet_name so these errors can be written to the log.txt + file. If there is no event.sheet_name for a given StartTable that means its headers did not match any of those + in the config file + + :param event: A filtered list of event objects of type StartTable + :return: An updated list of event objects + """ + try: + if event.table_name: + return event + except AttributeError: + return event.from_event( + event, + match_error=f"Failed to find a set of matching columns headers for file titled " + f"'{event.filename}' which contains column headers {event.headers} so no output has been produced", + ) + return event + + +@streamfilter( + check=type_check(events.StartTable), + fail_function=pass_event, + error_function=pass_event, +) +def create_extra_column_error(event): + """ + Add a extra_column_error to StartTables that have more columns than the set of expected columns so these can be written to the log.txt + + :param event: A filtered list of event objects of type StartTable + :return: An updated list of event objects + """ + extra_columns = [ + item for item in event.headers if item not in event.expected_columns + ] + if len(extra_columns) == 0: + return event + else: + return event.from_event( + event, + extra_column_error=f"Additional columns were found in file titled " + f"'{event.filename}' than those expected from schema for filetype = {event.table_name}, so these columns have been removed: {extra_columns}", + ) + + +def save_errors_la(stream, la_log_dir): + """ + Count the error events and save them as a text file in the Local Authority Logs directory + only save the error events if there is at least one error in said event + + :param stream: A filtered list of event objects + :param la_log_dir: Location to save the gathered error logs + :return: An updated list of event objects + """ + start_time = f"{datetime.now():%Y-%m-%dT%H%M%SZ}" + for event in stream: + try: + if isinstance(event, ErrorTable) and ( + event.formatting_error_count is not None + and event.blank_error_count is not None + and event.table_name is not None + ): + if event.formatting_error_count or event.blank_error_count: + with open( + f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + "a", + ) as f: + f.write(event.table_name) + f.write("\n") + if event.formatting_error_count: + f.write( + "Number of cells that have been made blank " + "because they could not be formatted correctly" + ) + f.write("\n") + counter_dict = Counter(event.formatting_error_count) + f.write( + str(counter_dict)[9:-2] + ) # Remove "Counter({" and "})" from string + f.write("\n") + if event.blank_error_count: + f.write( + "Number of blank cells that should have contained data" + ) + f.write("\n") + blank_counter_dict = Counter(event.blank_error_count) + f.write( + str(blank_counter_dict)[9:-2] + ) # Remove "Counter({" and "})" from string + f.write("\n") + except AttributeError: + pass + + 
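+        # The ErrorTable branch above flushes each table's formatting and
+        # blank-cell error counts to a timestamped log file; the StartTable
+        # checks below surface table-level problems (headers that matched no
+        # known table, unexpected extra columns) recorded earlier in the stream.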
if isinstance(event, events.StartTable): + match_error = getattr(event, "match_error", None) + if match_error: + with open( + f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + "a", + ) as f: + f.write(match_error) + f.write("\n") + column_error = getattr(event, "extra_column_error", None) + if column_error: + with open( + f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + "a", + ) as f: + f.write(column_error) + f.write("\n") + yield event + + +def log_errors(stream): + """ + Compile the log error functions + + :param stream: A filtered list of event objects + :return: An updated list of event objects + """ + stream = blank_error_check(stream) + # stream = create_formatting_error_count(stream) + # stream = create_blank_error_count(stream) + # stream = create_file_match_error(stream) + # stream = create_extra_column_error(stream) + return stream diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py new file mode 100644 index 00000000..58bc03fc --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py @@ -0,0 +1,49 @@ +from sfdata_stream_parser.events import ( + StartElement, + EndElement, + TextNode, + CommentNode, + ProcessingInstructionNode, +) + +try: + from lxml import etree +except ImportError: + pass + + +def dom_parse(source, **kwargs): + """ + Equivalent of the xml parse included in the sfdata_stream_parser package, but uses the ET DOM + and allows direct DOM manipulation. + """ + parser = etree.iterparse(source, events=("start", "end", "comment", "pi"), **kwargs) + for action, elem in parser: + if action == "start": + yield StartElement(tag=elem.tag, attrib=elem.attrib, node=elem) + yield TextNode(text=elem.text) + elif action == "end": + yield EndElement(tag=elem.tag, node=elem) + if elem.tail: + yield TextNode(text=elem.tail) + elif action == "comment": + yield CommentNode(text=elem.text, node=elem) + elif action == "pi": + yield ProcessingInstructionNode(name=elem.target, text=elem.text, node=elem) + else: + raise ValueError(f"Unknown event: {action}") + + +def to_xml(stream, builder: etree.TreeBuilder): + for ev in stream: + if isinstance(ev, StartElement): + builder.start(ev.tag, getattr(ev, "attrs", {})) + elif isinstance(ev, EndElement): + builder.end(ev.tag) + elif isinstance(ev, TextNode): + builder.data(ev.text) + elif isinstance(ev, CommentNode): + builder.comment(ev.text) + elif isinstance(ev, ProcessingInstructionNode): + builder.pi(ev.name, ev.text) + yield ev From 028e589fc2dddc7da2a5cafa70f25862d08adc3c Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 7 Jul 2023 16:20:25 +0000 Subject: [PATCH 24/40] Work in progress --- .../csdatatools/datasets/cincensus/filters.py | 2 +- .../csww_main_functions.py | 28 +++++---- .../lds_csww_clean/cleaner.py | 48 +++++++++++++-- .../lds_csww_clean/converters.py | 60 +++++++++++++++++++ 4 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py diff --git a/liiatools/csdatatools/datasets/cincensus/filters.py b/liiatools/csdatatools/datasets/cincensus/filters.py index 87bedf33..fe3d48e5 100644 --- a/liiatools/csdatatools/datasets/cincensus/filters.py +++ b/liiatools/csdatatools/datasets/cincensus/filters.py @@ -179,7 +179,7 @@ def add_schema_dict(event, schema_path: str): if config_type in ["swetype"]: schema_dict = 
_create_regex_dict(config_type, schema_path) if config_type == "{http://www.w3.org/2001/XMLSchema}date": - schema_dict = {"date": "%d/%m/%Y"} + schema_dict = {"date": "%Y-%m-%d"} if config_type == "{http://www.w3.org/2001/XMLSchema}integer": schema_dict = {"numeric": "integer"} if config_type == "{http://www.w3.org/2001/XMLSchema}string": diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 4f2e3e93..11189a99 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -130,9 +130,13 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.add_schema(stream, schema=Schema(input_year).schema) stream = filters.add_schema_dict(stream, schema_path=FilePath(input_year).path) + # for e in stream: + # print(e.get('schema_dict')) # Clean stream stream = cleaner.clean_categories(stream) stream = cleaner.clean_dates(stream) + stream = cleaner.clean_numeric(stream) # TODO: implement function for decimal (works for integer) + stream = cleaner.clean_regex_string(stream) # TODO: implement function # Output results stream = csww_record.message_collector(stream) @@ -214,24 +218,24 @@ def pan_agg(input, la_code, output): # Run in Visual Studio Code |> -# cleanfile( -# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", -# "BAD", -# "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) +cleanfile( + "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", + "BAD", + "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +) # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", # ) -cleanfile( - r"C:\Users\patrick.troy\Downloads\LIIA tests\social_work_workforce_2022.xml", - "NEW", - r"C:\Users\patrick.troy\Downloads\LIIA tests", - r"C:\Users\patrick.troy\Downloads\LIIA tests", -) +# cleanfile( +# r"C:\Users\patrick.troy\Downloads\LIIA tests\social_work_workforce_2022.xml", +# "NEW", +# r"C:\Users\patrick.troy\Downloads\LIIA tests", +# r"C:\Users\patrick.troy\Downloads\LIIA tests", +# ) # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index b85b5da1..1dfc7bae 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -4,7 +4,7 @@ from sfdata_stream_parser import events from sfdata_stream_parser.filters.generic import streamfilter, pass_event -from liiatools.datasets.s903.lds_ssda903_clean.converters import to_category, to_integer +from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import to_category, to_integer, to_decimal from liiatools.datasets.shared_functions.converters import to_date #from liiatools.datasets.shared_functions.common import check_postcode @@ 
-22,9 +22,9 @@ def clean_dates(event): :param event: A filtered list of event objects of type text :return: An updated list of event objects """ - date = event.schema_dict["date"] + dateformat = event.schema_dict["date"] try: - newtext = to_date(event.text, date) + newtext = to_date(event.text, dateformat) return event.from_event(event, text=newtext, error="0") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -44,7 +44,45 @@ def clean_categories(event): try: newtext = to_category(event.text, category) if newtext != "error": - return event.from_event(event, text=newtext, error='0') + return event.from_event(event, text=f"*cat*{newtext}", error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): - return event.from_event(event, text="", error="1") \ No newline at end of file + return event.from_event(event, text="", error="1") + + +@streamfilter( + check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event +) +def clean_numeric(event): + """ + Convert all values that should be integers to integers based on the schema xsd file + + :param event: A filtered list of event objects of type integer + :return: An updated list of event objects + """ + numeric = event.schema_dict["numeric"] + #print(integer) + try: + if numeric == "integer": + newtext = to_integer(event.text, numeric) + elif numeric == "decimal": + newtext = to_decimal(event.text, numeric) + if newtext != "error": + return event.from_event(event, text=f"*{numeric[:3]}*{newtext}", error='0') + return event.from_event(event, text="", error="1") + except (AttributeError, TypeError, ValueError): + return event.from_event(event, text="", error="1") + + +@streamfilter( + check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event +) +def clean_regex_string(event): + """ + Convert all values that should be regex strings to regex strings based on the schema xsd file + + :param event: A filtered list of event objects of type regex string + :return: An updated list of event objects + """ + pattern = event.schema_dict["regex_string"] + print(pattern) \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py new file mode 100644 index 00000000..c09c6bd5 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -0,0 +1,60 @@ +import logging + +log = logging.getLogger(__name__) + + +def to_category(string, categories): + """ + Matches a string to a category based on categories given in a config file + the config file should contain a dictionary for each category for this function to loop through + return blank if no categories found + + :param string: Some string to convert into a category value + :param categories: A list of dictionaries containing different category:value pairs + :return: Either a category value, "error" or blank string + """ + for code in categories: + if str(string).lower() == str(code["code"]).lower(): + return code["code"] + if ( + str(string).lower() == str(code["code"]).lower() + ".0" + ): # In case integers are read as floats + return code["code"] + elif "name" in code: + if str(code["name"]).lower() in str(string).lower(): + return code["code"] + elif not string: + return "" + elif not string: + return "" + return "error" + + +def to_integer(value, config): + """ + Convert any strings that should be integers 
based on the config into integers + + :param value: Some value to convert to an integer + :param config: The loaded configuration + :return: Either an integer value or a blank string + """ + if config == "integer": + if isinstance(value, str) and value[-2:] == ".0": + return int(float(value)) + elif value or value == 0: + return int(value) + else: + return "" + else: + return value + + +def to_decimal(value, config): + """ + Convert any strings that should be decimal based on the config into decimals + + :param value: Some value to convert to a decimal + :param config: The loaded configuration + :return: Either a decimal value or a blank string + """ + pass From 4de8ea4bb0fadc1f6b131d2d6c0d62bb5080f46f Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 10 Jul 2023 12:07:39 +0000 Subject: [PATCH 25/40] Implement cleaner and unit test --- .../csww_main_functions.py | 4 +- .../lds_csww_clean/cleaner.py | 31 +++++---- .../lds_csww_clean/converters.py | 52 ++++++++++++--- .../lds_csww_clean/file_creator.py | 2 +- .../lds_csww_clean/filters.py | 2 +- .../social_work_workforce/test_converters.py | 65 +++++++++++++++++++ 6 files changed, 131 insertions(+), 25 deletions(-) create mode 100644 tests/social_work_workforce/test_converters.py diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index b614074c..438574e4 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -138,8 +138,8 @@ def cleanfile(input, la_code, la_log_dir, output): # Clean stream stream = cleaner.clean_categories(stream) stream = cleaner.clean_dates(stream) - stream = cleaner.clean_numeric(stream) # TODO: implement function for decimal (works for integer) - stream = cleaner.clean_regex_string(stream) # TODO: implement function + stream = cleaner.clean_numeric(stream) + stream = cleaner.clean_regex_string(stream) stream = logger.log_errors(stream) # Output results diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 1dfc7bae..1948265d 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -4,7 +4,7 @@ from sfdata_stream_parser import events from sfdata_stream_parser.filters.generic import streamfilter, pass_event -from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import to_category, to_integer, to_decimal +from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import to_category, to_integer, to_decimal, to_regex from liiatools.datasets.shared_functions.converters import to_date #from liiatools.datasets.shared_functions.common import check_postcode @@ -19,7 +19,7 @@ def clean_dates(event): """ Convert all values that should be dates to dates based on the schema xsd file - :param event: A filtered list of event objects of type text + :param event: A filtered list of event objects of type TextNode :return: An updated list of event objects """ dateformat = event.schema_dict["date"] @@ -37,14 +37,14 @@ def clean_categories(event): """ Convert all values that should be categories to categories based on the schema xsd file - :param event: A filtered list of event objects of type text + :param event: A filtered list of event objects of type TextNode :return: An 
updated list of event objects """ category = event.schema_dict["category"] try: newtext = to_category(event.text, category) if newtext != "error": - return event.from_event(event, text=f"*cat*{newtext}", error='0') + return event.from_event(event, text=newtext, error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -55,20 +55,21 @@ def clean_categories(event): ) def clean_numeric(event): """ - Convert all values that should be integers to integers based on the schema xsd file + Convert all values that should be integers or decimals to integers or decimals based on the schema xsd file - :param event: A filtered list of event objects of type integer + :param event: A filtered list of event objects of type TextNode :return: An updated list of event objects """ numeric = event.schema_dict["numeric"] - #print(integer) try: if numeric == "integer": newtext = to_integer(event.text, numeric) elif numeric == "decimal": - newtext = to_decimal(event.text, numeric) + #print(event.schema_dict["fixed"], event.schema_dict["decimal"]) + decimalplaces = int(event.schema_dict["decimal"]) + newtext = to_decimal(event.text, numeric, decimalplaces) if newtext != "error": - return event.from_event(event, text=f"*{numeric[:3]}*{newtext}", error='0') + return event.from_event(event, text=newtext, error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -76,13 +77,19 @@ def clean_numeric(event): @streamfilter( check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event -) +) def clean_regex_string(event): """ Convert all values that should be regex strings to regex strings based on the schema xsd file - :param event: A filtered list of event objects of type regex string + :param event: A filtered list of event objects of type TextNode :return: An updated list of event objects """ pattern = event.schema_dict["regex_string"] - print(pattern) \ No newline at end of file + try: + newtext = to_regex(event.text, pattern) + if newtext != "error": + return event.from_event(event, text=newtext, error="0") + return event.from_event(event, text="", error="1") + except (AttributeError, TypeError, ValueError): + return event.from_event(event, text="", error="1") \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py index c09c6bd5..4cd72810 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -1,12 +1,13 @@ import logging +import re log = logging.getLogger(__name__) def to_category(string, categories): """ - Matches a string to a category based on categories given in a config file - the config file should contain a dictionary for each category for this function to loop through + Matches a string to a category based on categories given in a schema file + the schema file should contain a dictionary for each category for this function to loop through return blank if no categories found :param string: Some string to convert into a category value @@ -18,12 +19,12 @@ def to_category(string, categories): return code["code"] if ( str(string).lower() == str(code["code"]).lower() + ".0" - ): # In case integers are read as floats + ): # In case integers are read as 
floats return code["code"] - elif "name" in code: + if "name" in code: if str(code["name"]).lower() in str(string).lower(): return code["code"] - elif not string: + if not string: return "" elif not string: return "" @@ -47,14 +48,47 @@ def to_integer(value, config): return "" else: return value - -def to_decimal(value, config): + +def to_decimal(value, config, decplaces=0): """ Convert any strings that should be decimal based on the config into decimals :param value: Some value to convert to a decimal :param config: The loaded configuration - :return: Either a decimal value or a blank string + :param decplaces: The number of decimal places + :return: Either a decimal value formatted to number of decimal places or a blank string + """ + dpdisplayformat= f".{decplaces}f" + if config == "decimal": + try: + float(value) + roundtodp = round(float(value), decplaces) + return f"{roundtodp: {dpdisplayformat}}".strip() + except (ValueError, TypeError): + return "" + return value + + +def to_regex(value, pattern): """ - pass + Convert any strings that should conform to regex pattern based on the schema into regex string + + :param value: Some value to convert to a regex string + :param pattern: The regex pattern to compare + :return: Either a regex string or a blank string + """ + if pattern: + if value: + try: + isfullmatch = re.fullmatch(pattern, value) + if isfullmatch: + return value + else: + return "" + except (ValueError, TypeError): + return "" + else: + return "" + else: + return value \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index 66fe79a0..38da3243 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -48,7 +48,7 @@ def degrade_SWENo(data): """ if "SWENo" in data: if data["SWENo"] is not None: - data["SWENo"] = data["SWENo"].apply(lambda row: swe_hash(row)) + data["SWENo"] = data["SWENo"].apply(lambda row: swe_hash(row) if row else row ) return data diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py index 06fb8798..eafb9297 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py @@ -174,7 +174,7 @@ def add_schema_dict(event, schema_path: str): if config_type in ["swetype"]: schema_dict = _create_regex_dict(config_type, schema_path) if config_type == "{http://www.w3.org/2001/XMLSchema}date": - schema_dict = {"date": "%Y/%m/%d"} + schema_dict = {"date": "%Y-%m-%d"} if config_type == "{http://www.w3.org/2001/XMLSchema}integer": schema_dict = {"numeric": "integer"} if config_type == "{http://www.w3.org/2001/XMLSchema}string": diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py new file mode 100644 index 00000000..d5d2bce2 --- /dev/null +++ b/tests/social_work_workforce/test_converters.py @@ -0,0 +1,65 @@ +from liiatools.datasets.social_work_workforce.lds_csww_clean import converters + + +def test_to_category(): + category_dict = [ + {"code": "M1"}, + {"code": "F1"}, + {"code": "MM"}, + {"code": "FF"}, + {"code": "MF"}, + ] + assert converters.to_category("M1", category_dict) == "M1" + assert converters.to_category("M2", category_dict) == "error" + assert converters.to_category("MF", 
category_dict) == "MF" + assert converters.to_category("", category_dict) == "" + assert converters.to_category(None, category_dict) == "" + + category_dict = [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + assert converters.to_category(0, category_dict) == "0" + assert converters.to_category("false", category_dict) == "0" + assert converters.to_category(1.0, category_dict) == "1" + assert converters.to_category("true", category_dict) == "1" + assert converters.to_category("string", category_dict) == "error" + assert converters.to_category(123, category_dict) == "error" + assert converters.to_category("", category_dict) == "" + assert converters.to_category(None, category_dict) == "" + + +def test_to_integer(): + assert converters.to_integer("3000", "integer") == 3000 + assert converters.to_integer(123, "integer") == 123 + assert converters.to_integer("1.0", "integer") == 1 + assert converters.to_integer("date", "") == "date" + assert converters.to_integer(0, "integer") == 0 + assert converters.to_integer("", "integer") == "" + assert converters.to_integer(None, "integer") == "" + + +def test_to_decimal(): + decimalplaces = 3 + assert converters.to_decimal("12.345", "decimal", decimalplaces) == "12.345" + assert converters.to_decimal("12.3456", "decimal", decimalplaces) == "12.346" + assert converters.to_decimal("12.3", "decimal", decimalplaces) == "12.300" + assert converters.to_decimal(12.3456, "decimal", decimalplaces) == "12.346" + assert converters.to_decimal("1.0", "decimal", decimalplaces) == "1.000" + assert converters.to_decimal(0, "decimal", decimalplaces) == "0.000" + assert converters.to_decimal("date", "") == "date" + assert converters.to_decimal("", "decimal", decimalplaces) == "" + assert converters.to_decimal(None, "decimal", decimalplaces) == "" + + +def test_to_regex(): + pattern=r"[A-Za-z]{2}\d{10}" + assert converters.to_regex("AB1234567890",pattern) == "AB1234567890" # match + assert converters.to_regex("AB1234567890123456",pattern) == "" # too long + assert converters.to_regex("AB12345",pattern) == "" # too short + assert converters.to_regex("date", "") == "date" # no pattern + assert converters.to_regex("", pattern) == "" # no value + assert converters.to_regex(None, pattern) == "" # no value + + +# test_to_category() +# test_to_integer() +test_to_decimal() +#test_to_regex() From be180f9cf23c7279f7d83520d5aebb52f5eefcf6 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 10 Jul 2023 12:11:42 +0000 Subject: [PATCH 26/40] Update test_converters --- tests/social_work_workforce/test_converters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py index d5d2bce2..4796cf90 100644 --- a/tests/social_work_workforce/test_converters.py +++ b/tests/social_work_workforce/test_converters.py @@ -61,5 +61,5 @@ def test_to_regex(): # test_to_category() # test_to_integer() -test_to_decimal() -#test_to_regex() +# test_to_decimal() +# test_to_regex() From 119d4ab75f80a33da51e95130ed365494e7d36df Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:47:21 +0000 Subject: [PATCH 27/40] Implement logger, write unit tests --- .../csww_main_functions.py | 9 +- .../lds_csww_clean/cleaner.py | 27 +- .../lds_csww_clean/converters.py | 52 ++-- .../lds_csww_clean/logger.py | 68 ++--- tests/social_work_workforce/test_cleaner.py | 268 
++++++++++++++++++ .../social_work_workforce/test_converters.py | 25 +- tests/social_work_workforce/test_logger.py | 144 ++++++++++ 7 files changed, 506 insertions(+), 87 deletions(-) create mode 100644 tests/social_work_workforce/test_cleaner.py create mode 100644 tests/social_work_workforce/test_logger.py diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 438574e4..ac0f3d5e 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -108,8 +108,8 @@ def cleanfile(input, la_code, la_log_dir, output): stream = dom_parse(input) # Get year from input file + filename = str(Path(input).resolve().stem) try: - filename = str(Path(input).resolve().stem) input_year = check_year(filename) except (AttributeError, ValueError): save_year_error(input, la_log_dir) @@ -132,17 +132,16 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) stream = filters.add_schema_dict(stream, schema_path=FilePath(input_year).path) - - # for e in stream: - # print(e.get('schema_dict')) # Clean stream stream = cleaner.clean_categories(stream) stream = cleaner.clean_dates(stream) stream = cleaner.clean_numeric(stream) stream = cleaner.clean_regex_string(stream) + stream = logger.log_errors(stream) # Output results + stream = logger.save_errors_la(stream, la_log_dir=la_log_dir, filename=filename) stream = csww_record.message_collector(stream) data_worker, data_lalevel = csww_record.export_table(stream) @@ -225,7 +224,7 @@ def pan_agg(input, la_code, output): cleanfile( "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", "BAD", - "/workspaces/liia_tools/liiatools/datasets/social_work_workforce/lds_csww_clean", + "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", ) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 1948265d..22287af8 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -24,8 +24,8 @@ def clean_dates(event): """ dateformat = event.schema_dict["date"] try: - newtext = to_date(event.text, dateformat) - return event.from_event(event, text=newtext, error="0") + clean_text = to_date(event.text, dateformat) + return event.from_event(event, text=clean_text, error="0") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -42,9 +42,9 @@ def clean_categories(event): """ category = event.schema_dict["category"] try: - newtext = to_category(event.text, category) - if newtext != "error": - return event.from_event(event, text=newtext, error='0') + clean_text = to_category(event.text, category) + if clean_text != "error": + return event.from_event(event, text=clean_text, error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -63,13 +63,12 @@ def clean_numeric(event): numeric = event.schema_dict["numeric"] try: if numeric == "integer": - newtext = to_integer(event.text, numeric) + clean_text = 
to_integer(event.text, numeric) elif numeric == "decimal": - #print(event.schema_dict["fixed"], event.schema_dict["decimal"]) - decimalplaces = int(event.schema_dict["decimal"]) - newtext = to_decimal(event.text, numeric, decimalplaces) - if newtext != "error": - return event.from_event(event, text=newtext, error='0') + decimal_places = int(event.schema_dict["decimal"]) + clean_text = to_decimal(event.text, numeric, decimal_places) + if clean_text != "error": + return event.from_event(event, text=clean_text, error='0') return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") @@ -87,9 +86,9 @@ def clean_regex_string(event): """ pattern = event.schema_dict["regex_string"] try: - newtext = to_regex(event.text, pattern) - if newtext != "error": - return event.from_event(event, text=newtext, error="0") + clean_text = to_regex(event.text, pattern) + if clean_text != "error": + return event.from_event(event, text=clean_text, error="0") return event.from_event(event, text="", error="1") except (AttributeError, TypeError, ValueError): return event.from_event(event, text="", error="1") \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py index 4cd72810..f3ba9ddc 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -12,7 +12,7 @@ def to_category(string, categories): :param string: Some string to convert into a category value :param categories: A list of dictionaries containing different category:value pairs - :return: Either a category value, "error" or blank string + :return: Either a category value, "error" if category is invalid or blank string """ for code in categories: if str(string).lower() == str(code["code"]).lower(): @@ -37,36 +37,39 @@ def to_integer(value, config): :param value: Some value to convert to an integer :param config: The loaded configuration - :return: Either an integer value or a blank string + :return: Either an integer value or an "error" string if value could not be formatted as integer or a blank string if no value provided """ if config == "integer": - if isinstance(value, str) and value[-2:] == ".0": - return int(float(value)) - elif value or value == 0: - return int(value) - else: - return "" + if value or value==0: + if isinstance(value, str) and value[-2:] == ".0": + return int(float(value)) + elif value or value == 0: + return int(value) + else: + return "error" # value incorrectly formatted + return "" # no value provided else: return value -def to_decimal(value, config, decplaces=0): +def to_decimal(value, config, decimal_places=0): """ Convert any strings that should be decimal based on the config into decimals :param value: Some value to convert to a decimal :param config: The loaded configuration - :param decplaces: The number of decimal places - :return: Either a decimal value formatted to number of decimal places or a blank string + :param dec_places: The number of decimal places to apply (default 0) + :return: Either a decimal value formatted to number of decimal places or an "error" string if value could not be formatted as decimal or a blank string if no value provided """ - dpdisplayformat= f".{decplaces}f" if config == "decimal": - try: - float(value) - roundtodp = round(float(value), decplaces) - return f"{roundtodp: 
{dpdisplayformat}}".strip() - except (ValueError, TypeError): - return "" + if value or value == 0: + try: + float(value) + round_to_dp = round(float(value), decimal_places) + return round_to_dp + except (ValueError, TypeError): + return "error" # value incorrectly formatted + return "" # no value provided return value @@ -76,19 +79,20 @@ def to_regex(value, pattern): :param value: Some value to convert to a regex string :param pattern: The regex pattern to compare - :return: Either a regex string or a blank string + :return: Either a string matching the regex pattern or an "error" string if value does not match pattern or a blank string if no value provided """ if pattern: if value: + stripped_value = value.strip() try: - isfullmatch = re.fullmatch(pattern, value) + isfullmatch = re.fullmatch(pattern, stripped_value) if isfullmatch: - return value + return stripped_value else: - return "" + return "error" # value does not match regex pattern except (ValueError, TypeError): - return "" + return "error" # value incorrectly formatted else: - return "" + return "" # no value provided else: return value \ No newline at end of file diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py index 1fc9cf88..f9532d14 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py @@ -14,7 +14,7 @@ class ErrorTable(events.ParseEvent): pass -def create_formatting_error_count(stream): +def create_formatting_error_list(stream): """ Create a list of the column headers for cells with formatting errors (event.error = 1) for each table @@ -56,16 +56,16 @@ def blank_error_check(event): """ try: allowed_blank = event.schema_dict["canbeblank"] - error = getattr(event, "error", "0") - if not allowed_blank and not event.text and error != "1": + format_error = getattr(event, "error", "0") + if not allowed_blank and not event.text and format_error != "1": return event.from_event(event, blank_error="1") else: return event - except AttributeError: # Raised in case there is no config item for the given cell + except AttributeError: # Raised in case there is no schema dict for the given cell pass -def create_blank_error_count(stream): +def create_blank_error_list(stream): """ Create a list of the column headers for cells with blank fields that should not be blank (event.blank_error = 1) for each table @@ -75,18 +75,21 @@ def create_blank_error_count(stream): """ blank_error_count = None for event in stream: - if isinstance(event, events.StartTable): + if isinstance(event, events.StartElement) and event.tag == "LALevelVacancies": blank_error_count = [] - elif isinstance(event, events.EndTable): + elif isinstance(event, events.EndElement) and event.tag == "Message": blank_error_count = None elif isinstance(event, ErrorTable): yield ErrorTable.from_event(event, blank_error_count=blank_error_count) blank_error_count = None - elif blank_error_count is not None and isinstance(event, events.Cell): + elif ( + blank_error_count is not None + and isinstance(event, events.TextNode) + ): try: if event.blank_error == "1": - blank_error_count.append(event.header) - except AttributeError: + blank_error_count.append(event.schema.name) + except AttributeError: # Raised in case there is no event.blank_error pass yield event @@ -142,13 +145,14 @@ def create_extra_column_error(event): ) -def save_errors_la(stream, la_log_dir): +def save_errors_la(stream, 
la_log_dir, filename): """ Count the error events and save them as a text file in the Local Authority Logs directory only save the error events if there is at least one error in said event :param stream: A filtered list of event objects :param la_log_dir: Location to save the gathered error logs + :param filename: Filename to use :return: An updated list of event objects """ start_time = f"{datetime.now():%Y-%m-%dT%H%M%SZ}" @@ -157,14 +161,12 @@ def save_errors_la(stream, la_log_dir): if isinstance(event, ErrorTable) and ( event.formatting_error_count is not None and event.blank_error_count is not None - and event.table_name is not None ): if event.formatting_error_count or event.blank_error_count: with open( - f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + f"{os.path.join(la_log_dir, filename)}_error_log_{start_time}.txt", "a", ) as f: - f.write(event.table_name) f.write("\n") if event.formatting_error_count: f.write( @@ -190,23 +192,23 @@ def save_errors_la(stream, la_log_dir): except AttributeError: pass - if isinstance(event, events.StartTable): - match_error = getattr(event, "match_error", None) - if match_error: - with open( - f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", - "a", - ) as f: - f.write(match_error) - f.write("\n") - column_error = getattr(event, "extra_column_error", None) - if column_error: - with open( - f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", - "a", - ) as f: - f.write(column_error) - f.write("\n") + # if isinstance(event, events.StartTable): + # match_error = getattr(event, "match_error", None) + # if match_error: + # with open( + # f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + # "a", + # ) as f: + # f.write(match_error) + # f.write("\n") + # column_error = getattr(event, "extra_column_error", None) + # if column_error: + # with open( + # f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", + # "a", + # ) as f: + # f.write(column_error) + # f.write("\n") yield event @@ -218,8 +220,8 @@ def log_errors(stream): :return: An updated list of event objects """ stream = blank_error_check(stream) - # stream = create_formatting_error_count(stream) - # stream = create_blank_error_count(stream) + stream = create_formatting_error_list(stream) + stream = create_blank_error_list(stream) # stream = create_file_match_error(stream) # stream = create_extra_column_error(stream) return stream diff --git a/tests/social_work_workforce/test_cleaner.py b/tests/social_work_workforce/test_cleaner.py new file mode 100644 index 00000000..9da0d6ea --- /dev/null +++ b/tests/social_work_workforce/test_cleaner.py @@ -0,0 +1,268 @@ +from datetime import datetime +from sfdata_stream_parser import events +from liiatools.datasets.social_work_workforce.lds_csww_clean import cleaner + + +def test_clean_dates(): + event = events.TextNode(text=datetime(2019, 1, 15), schema_dict={"date": "%d/%m/%Y"}) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == datetime(2019, 1, 15).date() + assert cleaned_event.error == "0" + + event = events.TextNode(text="2019/1/15", schema_dict={"date": "%d/%m/%Y"}) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode( + text=datetime(2019, 1, 15), schema_dict={"not_date": "%d/%m/%Y"} + ) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == datetime(2019, 1, 15) + + event = 
events.TextNode(text="string", schema_dict={"not_date": "%d/%m/%Y"}) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == "string" + + event = events.TextNode(text=None, schema_dict={"date": "%d/%m/%Y"}) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text="", schema_dict={"date": "%d/%m/%Y"}) + cleaned_event = list(cleaner.clean_dates(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + +def test_clean_categories(): + event = events.TextNode( + text="0", + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "0" + assert cleaned_event.error == "0" + + event = events.TextNode( + text="0.0", + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "0" + assert cleaned_event.error == "0" + + event = events.TextNode( + text=0, + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "0" + assert cleaned_event.error == "0" + + event = events.TextNode( + text="true", + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "1" + assert cleaned_event.error == "0" + + event = events.TextNode( + text=123, + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode( + text="string", + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode( + text="string", + schema_dict={ + "not_category": [ + {"code": "0", "name": "False"}, + {"code": "1", "name": "True"}, + ] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "string" + + event = events.TextNode( + text=None, + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode( + text="", + schema_dict={ + "category": [{"code": "0", "name": "False"}, {"code": "1", "name": "True"}] + }, + ) + cleaned_event = list(cleaner.clean_categories(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + +def test_clean_numeric_integer(): + event = events.TextNode(text=123, schema_dict={"numeric": "integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123 + assert cleaned_event.error == "0" + + event = events.TextNode(text="", schema_dict={"numeric": "integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text=None, schema_dict={"numeric": 
"integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text="123", schema_dict={"numeric": "integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123 + assert cleaned_event.error == "0" + + event = events.TextNode(text="string", schema_dict={"numeric": "integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "integer"}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode( + text=datetime(2017, 3, 17), schema_dict={"not_numeric": "integer"} + ) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == datetime(2017, 3, 17) + + +def test_clean_numeric_decimal(): + event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123.45 + assert cleaned_event.error == "0" + + event = events.TextNode(text=123.4567, schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123.46 + assert cleaned_event.error == "0" + + event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 0}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123 + assert cleaned_event.error == "0" + + event = events.TextNode(text=123.456, schema_dict={"numeric": "decimal", "decimal": 6}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123.456 + assert cleaned_event.error == "0" + + event = events.TextNode(text="", schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text=None, schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text="123.4567", schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 123.46 + assert cleaned_event.error == "0" + + event = events.TextNode(text="string", schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "decimal", "decimal": 2}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode( + text=datetime(2017, 3, 17), schema_dict={"not_numeric": "decimal"} + ) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == datetime(2017, 3, 17) + +def test_clean_regex_string(): + event = events.TextNode(text="AB1234567890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "AB1234567890" + assert cleaned_event.error == "0" + + event = events.TextNode(text="", schema_dict={"regex_string": 
r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text=None, schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "0" + + event = events.TextNode(text=" AB1234567890 ", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "AB1234567890" + assert cleaned_event.error == "0" + + event = events.TextNode(text="AB123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode(text="AB1234567890123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode(text="AB12345 67890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.error == "1" + + event = events.TextNode(text="string", schema_dict={"not_regex_string": r"[A-Za-z]{2}\d{10}"}) + cleaned_event = list(cleaner.clean_regex_string(event))[0] + assert cleaned_event.text == "string" + + +# test_clean_dates() +# test_clean_categories() +# test_clean_numeric_integer() +# test_clean_numeric_decimal() +# test_clean_regex_string() + diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py index 4796cf90..88b8ff1d 100644 --- a/tests/social_work_workforce/test_converters.py +++ b/tests/social_work_workforce/test_converters.py @@ -31,29 +31,32 @@ def test_to_integer(): assert converters.to_integer(123, "integer") == 123 assert converters.to_integer("1.0", "integer") == 1 assert converters.to_integer("date", "") == "date" + assert converters.to_integer(123.456, "integer") == 123 assert converters.to_integer(0, "integer") == 0 assert converters.to_integer("", "integer") == "" assert converters.to_integer(None, "integer") == "" def test_to_decimal(): - decimalplaces = 3 - assert converters.to_decimal("12.345", "decimal", decimalplaces) == "12.345" - assert converters.to_decimal("12.3456", "decimal", decimalplaces) == "12.346" - assert converters.to_decimal("12.3", "decimal", decimalplaces) == "12.300" - assert converters.to_decimal(12.3456, "decimal", decimalplaces) == "12.346" - assert converters.to_decimal("1.0", "decimal", decimalplaces) == "1.000" - assert converters.to_decimal(0, "decimal", decimalplaces) == "0.000" + decimal_places = 3 + assert converters.to_decimal("12.345", "decimal", decimal_places) == 12.345 + assert converters.to_decimal("12.3456", "decimal", decimal_places) == 12.346 + assert converters.to_decimal("12.3", "decimal", decimal_places) == 12.3 + assert converters.to_decimal(12.3456, "decimal", decimal_places) == 12.346 + assert converters.to_decimal("1.0", "decimal", decimal_places) == 1 + assert converters.to_decimal(0, "decimal", decimal_places) == 0 assert converters.to_decimal("date", "") == "date" - assert converters.to_decimal("", "decimal", decimalplaces) == "" - assert converters.to_decimal(None, "decimal", decimalplaces) == "" + assert converters.to_decimal("", "decimal", decimal_places) == "" + assert 
converters.to_decimal(None, "decimal", decimal_places) == "" def test_to_regex(): pattern=r"[A-Za-z]{2}\d{10}" assert converters.to_regex("AB1234567890",pattern) == "AB1234567890" # match - assert converters.to_regex("AB1234567890123456",pattern) == "" # too long - assert converters.to_regex("AB12345",pattern) == "" # too short + assert converters.to_regex(" AB1234567890 ",pattern) == "AB1234567890" # match + assert converters.to_regex("AB1234567890123456",pattern) == "error" # too long + assert converters.to_regex("AB12345",pattern) == "error" # too short + assert converters.to_regex("xxxxOz2054309383",pattern) == "error" # invalid format assert converters.to_regex("date", "") == "date" # no pattern assert converters.to_regex("", pattern) == "" # no value assert converters.to_regex(None, pattern) == "" # no value diff --git a/tests/social_work_workforce/test_logger.py b/tests/social_work_workforce/test_logger.py new file mode 100644 index 00000000..550887f0 --- /dev/null +++ b/tests/social_work_workforce/test_logger.py @@ -0,0 +1,144 @@ +import tempfile as tmp +from unittest.mock import patch +from pathlib import Path +from datetime import datetime + +from liiatools.datasets.social_work_workforce.lds_csww_clean import logger + +from sfdata_stream_parser import events + + +def test_create_formatting_error_list(): + stream = ( + events.StartTable(table_name="AD1"), + events.Cell(header="some_header", error="1"), + events.Cell(header="some_header", error="1"), + events.Cell(header="some_header", error="0"), + events.EndTable(), + ) + events_with_formatting_error_count = list( + logger.create_formatting_error_list(stream) + ) + for event in events_with_formatting_error_count: + if isinstance(event, logger.ErrorTable): + assert event.formatting_error_count == [ + "some_header", + "some_header", + ] + + stream = ( + events.StartTable(table_name="AD1"), + events.Cell(header="some_header", error="1"), + events.Cell(header="some_other_header", error="1"), + events.Cell(header="some_header"), + events.EndTable(), + ) + events_with_formatting_error_count = list( + logger.create_formatting_error_list(stream) + ) + for event in events_with_formatting_error_count: + if isinstance(event, logger.ErrorTable): + assert event.formatting_error_count == [ + "some_header", + "some_other_header", + ] + + stream = ( + events.StartTable(table_name="AD1"), + events.Cell(header="some_header", error="1"), + events.Cell(header="some_header_2", error=None), + events.Cell(header="some_header_3", error=""), + events.Cell(), + events.EndTable(), + ) + events_with_formatting_error_count = list( + logger.create_formatting_error_list(stream) + ) + for event in events_with_formatting_error_count: + if isinstance(event, logger.ErrorTable): + assert event.formatting_error_count == ["some_header"] + + +def test_blank_error_check(): + stream = logger.blank_error_check( + [ + events.TextNode(schema_dict={"canbeblank": False}, text="", error="0"), + events.TextNode(schema_dict={"canbeblank": False}, text=None, error="0"), + events.TextNode(schema_dict={"canbeblank": False}, text="", error="1"), + events.TextNode(schema_dict={"canbeblank": False}, text="string", error="0"), + events.TextNode(schema_dict={"canbeblank": True}, text="", error="0"), + ] + ) + # for e in stream: + # print(e.as_dict()) + stream = list(stream) + # print(stream[1].as_dict()) + assert stream[0].blank_error == "1" + assert stream[1].blank_error == "1" + assert "blank_error" not in stream[2].as_dict() + assert "blank_error" not in stream[3].as_dict() + assert 
"blank_error" not in stream[4].as_dict() + + +def test_create_blank_error_list(): + stream = ( + events.StartTable(), + events.Cell(header="some_header", blank_error="1"), + events.Cell(header="some_header_2", blank_error=None), + events.Cell(header="some_header_3", blank_error=""), + events.Cell(), + logger.ErrorTable(), + events.EndTable(), + ) + events_with_blank_error_count = list(logger.create_blank_error_list(stream)) + for event in events_with_blank_error_count: + if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: + assert event.blank_error_count == ["some_header"] + + stream = ( + events.StartTable(), + events.Cell(header="some_header", blank_error="1"), + events.Cell(header="some_header_2", blank_error="1"), + events.Cell(header="some_header_3", blank_error=""), + events.Cell(), + logger.ErrorTable(), + ) + events_with_blank_error_count = list(logger.create_blank_error_list(stream)) + for event in events_with_blank_error_count: + if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: + assert event.blank_error_count == [ + "some_header", + "some_header_2", + ] + + +@patch("builtins.open", create=True) +def test_save_errors_la(mock_save): + la_log_dir = tmp.gettempdir() + start_time = f"{datetime.now():%Y-%m-%dT%H%M%SZ}" + + stream = logger.save_errors_la( + [ + logger.ErrorTable( + filename="test_file", + formatting_error_count=["CHILD", "CHILD", "AGE"], + blank_error_count=["POSTCODE", "POSTCODE", "DATE"], + table_name="List 1", + extra_column_error=["list", "of", "headers"], + ), + ], + la_log_dir, + ) + stream = list(stream) + + mock_save.assert_called_once_with( + f"{Path(la_log_dir, 'test_file')}_error_log_{start_time}.txt", "a" + ) + # mock_save.write.assert_called_once_with(f"test_file_{start_time}") + +# test_create_formatting_error_list() +# test_blank_error_check() +# test_create_blank_error_list() +# test_save_errors_la() + + From 094faf41af0d1d24fc2bab638d85f5f975ca5650 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:39:18 +0000 Subject: [PATCH 28/40] Fix formatting_error name; run Black --- .../csww_main_functions.py | 9 +- .../lds_csww_clean/cleaner.py | 60 ++++++++--- .../lds_csww_clean/logger.py | 48 ++++----- .../spec/social_work_workforce/pan-agg.yml | 2 +- tests/social_work_workforce/test_logger.py | 101 +++++++++--------- 5 files changed, 123 insertions(+), 97 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index ac0f3d5e..1c65b7cc 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -11,7 +11,7 @@ from liiatools.datasets.social_work_workforce.lds_csww_clean.xml import ( etree, to_xml, - dom_parse + dom_parse, ) from liiatools.datasets.social_work_workforce.lds_csww_clean.schema import ( Schema, @@ -132,12 +132,9 @@ def cleanfile(input, la_code, la_log_dir, output): stream = filters.add_context(stream) stream = filters.add_schema(stream, schema=Schema(input_year).schema) stream = filters.add_schema_dict(stream, schema_path=FilePath(input_year).path) - # Clean stream - stream = cleaner.clean_categories(stream) - stream = cleaner.clean_dates(stream) - stream = cleaner.clean_numeric(stream) - stream = cleaner.clean_regex_string(stream) + # Clean stream + stream = cleaner.clean(stream) stream = logger.log_errors(stream) # Output results diff --git 
a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 22287af8..fb4e857f 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -4,16 +4,22 @@ from sfdata_stream_parser import events from sfdata_stream_parser.filters.generic import streamfilter, pass_event -from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import to_category, to_integer, to_decimal, to_regex +from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import ( + to_category, + to_integer, + to_decimal, + to_regex, +) from liiatools.datasets.shared_functions.converters import to_date -#from liiatools.datasets.shared_functions.common import check_postcode log = logging.getLogger(__name__) @streamfilter( - check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, ) def clean_dates(event): """ @@ -25,13 +31,15 @@ def clean_dates(event): dateformat = event.schema_dict["date"] try: clean_text = to_date(event.text, dateformat) - return event.from_event(event, text=clean_text, error="0") + return event.from_event(event, text=clean_text, formatting_error="0") except (AttributeError, TypeError, ValueError): - return event.from_event(event, text="", error="1") + return event.from_event(event, text="", formatting_error="1") @streamfilter( - check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, ) def clean_categories(event): """ @@ -44,14 +52,16 @@ def clean_categories(event): try: clean_text = to_category(event.text, category) if clean_text != "error": - return event.from_event(event, text=clean_text, error='0') - return event.from_event(event, text="", error="1") + return event.from_event(event, text=clean_text, formatting_error="0") + return event.from_event(event, text="", formatting_error="1") except (AttributeError, TypeError, ValueError): - return event.from_event(event, text="", error="1") + return event.from_event(event, text="", formatting_error="1") @streamfilter( - check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, ) def clean_numeric(event): """ @@ -68,14 +78,16 @@ def clean_numeric(event): decimal_places = int(event.schema_dict["decimal"]) clean_text = to_decimal(event.text, numeric, decimal_places) if clean_text != "error": - return event.from_event(event, text=clean_text, error='0') - return event.from_event(event, text="", error="1") + return event.from_event(event, text=clean_text, formatting_error="0") + return event.from_event(event, text="", formatting_error="1") except (AttributeError, TypeError, ValueError): - return event.from_event(event, text="", error="1") + return event.from_event(event, text="", formatting_error="1") @streamfilter( - check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, ) def clean_regex_string(event): """ @@ -88,7 +100,21 @@ def clean_regex_string(event): try: clean_text = to_regex(event.text, pattern) if clean_text != "error": - return event.from_event(event, 
text=clean_text, error="0") - return event.from_event(event, text="", error="1") + return event.from_event(event, text=clean_text, formatting_error="0") + return event.from_event(event, text="", formatting_error="1") except (AttributeError, TypeError, ValueError): - return event.from_event(event, text="", error="1") \ No newline at end of file + return event.from_event(event, text="", formatting_error="1") + + +def clean(stream): + """ + Compile the cleaning functions + + :param event: A list of event objects + :return: An updated list of event objects + """ + stream = clean_dates(stream) + stream = clean_categories(stream) + stream = clean_numeric(stream) + stream = clean_regex_string(stream) + return stream diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py index f9532d14..0765bfcd 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py @@ -21,24 +21,24 @@ def create_formatting_error_list(stream): :param stream: A filtered list of event objects :return: An updated list of event objects with error counts """ - formatting_error_count = None + formatting_error_list = None for event in stream: if isinstance(event, events.StartElement) and event.tag == "LALevelVacancies": - formatting_error_count = [] + formatting_error_list = [] elif isinstance(event, events.EndElement) and event.tag == "Message": yield ErrorTable.from_event( event, - formatting_error_count=formatting_error_count, + formatting_error_list=formatting_error_list, ) - formatting_error_count = None + formatting_error_list = None elif ( - formatting_error_count is not None + formatting_error_list is not None and isinstance(event, events.TextNode) ): try: - if event.error == "1": - formatting_error_count.append(event.schema.name) - except AttributeError: # Raised in case there is no event.error + if event.formatting_error == "1": + formatting_error_list.append(event.schema.name) + except AttributeError: # Raised in case there is no event.formatting_error pass yield event @@ -56,8 +56,8 @@ def blank_error_check(event): """ try: allowed_blank = event.schema_dict["canbeblank"] - format_error = getattr(event, "error", "0") - if not allowed_blank and not event.text and format_error != "1": + formatting_error = getattr(event, "formatting_error", "0") + if not allowed_blank and not event.text and formatting_error != "1": return event.from_event(event, blank_error="1") else: return event @@ -73,22 +73,22 @@ def create_blank_error_list(stream): :param stream: A filtered list of event objects :return: An updated list of event objects """ - blank_error_count = None + blank_error_list = None for event in stream: if isinstance(event, events.StartElement) and event.tag == "LALevelVacancies": - blank_error_count = [] + blank_error_list = [] elif isinstance(event, events.EndElement) and event.tag == "Message": - blank_error_count = None + blank_error_list = None elif isinstance(event, ErrorTable): - yield ErrorTable.from_event(event, blank_error_count=blank_error_count) - blank_error_count = None + yield ErrorTable.from_event(event, blank_error_list=blank_error_list) + blank_error_list = None elif ( - blank_error_count is not None + blank_error_list is not None and isinstance(event, events.TextNode) ): try: if event.blank_error == "1": - blank_error_count.append(event.schema.name) + blank_error_list.append(event.schema.name) except AttributeError: # Raised in 
case there is no event.blank_error pass yield event @@ -159,32 +159,32 @@ def save_errors_la(stream, la_log_dir, filename): for event in stream: try: if isinstance(event, ErrorTable) and ( - event.formatting_error_count is not None - and event.blank_error_count is not None + event.formatting_error_list is not None + and event.blank_error_list is not None ): - if event.formatting_error_count or event.blank_error_count: + if event.formatting_error_list or event.blank_error_list: with open( f"{os.path.join(la_log_dir, filename)}_error_log_{start_time}.txt", "a", ) as f: f.write("\n") - if event.formatting_error_count: + if event.formatting_error_list: f.write( "Number of cells that have been made blank " "because they could not be formatted correctly" ) f.write("\n") - counter_dict = Counter(event.formatting_error_count) + counter_dict = Counter(event.formatting_error_list) f.write( str(counter_dict)[9:-2] ) # Remove "Counter({" and "})" from string f.write("\n") - if event.blank_error_count: + if event.blank_error_list: f.write( "Number of blank cells that should have contained data" ) f.write("\n") - blank_counter_dict = Counter(event.blank_error_count) + blank_counter_dict = Counter(event.blank_error_list) f.write( str(blank_counter_dict)[9:-2] ) # Remove "Counter({" and "})" from string diff --git a/liiatools/spec/social_work_workforce/pan-agg.yml b/liiatools/spec/social_work_workforce/pan-agg.yml index c6bdad4e..ecf3b9b6 100644 --- a/liiatools/spec/social_work_workforce/pan-agg.yml +++ b/liiatools/spec/social_work_workforce/pan-agg.yml @@ -28,4 +28,4 @@ column_names: pan_data_kept: - CSWWWorker - - LALevelVacanciespisodes \ No newline at end of file + - LALevelVacancies \ No newline at end of file diff --git a/tests/social_work_workforce/test_logger.py b/tests/social_work_workforce/test_logger.py index 550887f0..aa291052 100644 --- a/tests/social_work_workforce/test_logger.py +++ b/tests/social_work_workforce/test_logger.py @@ -1,4 +1,5 @@ import tempfile as tmp +import xmlschema from unittest.mock import patch from pathlib import Path from datetime import datetime @@ -11,9 +12,9 @@ def test_create_formatting_error_list(): stream = ( events.StartTable(table_name="AD1"), - events.Cell(header="some_header", error="1"), - events.Cell(header="some_header", error="1"), - events.Cell(header="some_header", error="0"), + events.Cell(header="some_header", formatting_error="1"), + events.Cell(header="some_header", formatting_error="1"), + events.Cell(header="some_header", formatting_error="0"), events.EndTable(), ) events_with_formatting_error_count = list( @@ -28,8 +29,8 @@ def test_create_formatting_error_list(): stream = ( events.StartTable(table_name="AD1"), - events.Cell(header="some_header", error="1"), - events.Cell(header="some_other_header", error="1"), + events.Cell(header="some_header", formatting_error="1"), + events.Cell(header="some_other_header", formatting_error="1"), events.Cell(header="some_header"), events.EndTable(), ) @@ -45,9 +46,9 @@ def test_create_formatting_error_list(): stream = ( events.StartTable(table_name="AD1"), - events.Cell(header="some_header", error="1"), - events.Cell(header="some_header_2", error=None), - events.Cell(header="some_header_3", error=""), + events.Cell(header="some_header", formatting_error="1"), + events.Cell(header="some_header_2", formatting_error=None), + events.Cell(header="some_header_3", formatting_error=""), events.Cell(), events.EndTable(), ) @@ -62,17 +63,14 @@ def test_create_formatting_error_list(): def test_blank_error_check(): stream 
= logger.blank_error_check( [ - events.TextNode(schema_dict={"canbeblank": False}, text="", error="0"), - events.TextNode(schema_dict={"canbeblank": False}, text=None, error="0"), - events.TextNode(schema_dict={"canbeblank": False}, text="", error="1"), - events.TextNode(schema_dict={"canbeblank": False}, text="string", error="0"), - events.TextNode(schema_dict={"canbeblank": True}, text="", error="0"), + events.TextNode(schema_dict={"canbeblank": False}, text="", formatting_error="0"), + events.TextNode(schema_dict={"canbeblank": False}, text=None, formatting_error="0"), + events.TextNode(schema_dict={"canbeblank": False}, text="", formatting_error="1"), + events.TextNode(schema_dict={"canbeblank": False}, text="string", formatting_error="0"), + events.TextNode(schema_dict={"canbeblank": True}, text="", formatting_error="0"), ] ) - # for e in stream: - # print(e.as_dict()) stream = list(stream) - # print(stream[1].as_dict()) assert stream[0].blank_error == "1" assert stream[1].blank_error == "1" assert "blank_error" not in stream[2].as_dict() @@ -80,36 +78,41 @@ def test_blank_error_check(): assert "blank_error" not in stream[4].as_dict() -def test_create_blank_error_list(): - stream = ( - events.StartTable(), - events.Cell(header="some_header", blank_error="1"), - events.Cell(header="some_header_2", blank_error=None), - events.Cell(header="some_header_3", blank_error=""), - events.Cell(), - logger.ErrorTable(), - events.EndTable(), - ) - events_with_blank_error_count = list(logger.create_blank_error_list(stream)) - for event in events_with_blank_error_count: - if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: - assert event.blank_error_count == ["some_header"] - - stream = ( - events.StartTable(), - events.Cell(header="some_header", blank_error="1"), - events.Cell(header="some_header_2", blank_error="1"), - events.Cell(header="some_header_3", blank_error=""), - events.Cell(), - logger.ErrorTable(), - ) - events_with_blank_error_count = list(logger.create_blank_error_list(stream)) - for event in events_with_blank_error_count: - if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: - assert event.blank_error_count == [ - "some_header", - "some_header_2", - ] +# def test_create_blank_error_list(): +# schema = xmlschema.XsdElement('some_header',"xxx",None,True) +# stream = ( +# events.StartElement(tag="LALevelVacancies"), +# events.TextNode(text="text_1", schema=schema, blank_error="1"), +# # events.TextNode(text="text_2", schema={'name': 'some_header_2'}, blank_error=None), +# # events.TextNode(text="text_3", schema={'name': 'some_header_3'}, blank_error=""), +# # events.TextNode(text="text_4", schema="header {'name': 'some_header_4'}')"), +# events.EndElement(tag="Message"), +# logger.ErrorTable(), +# ) +# events_with_blank_error_count = list(logger.create_blank_error_list(stream)) +# print(f"blank error headers = {events_with_blank_error_count}") +# for event in events_with_blank_error_count: +# print(event.schema.name) +# if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: +# assert event.blank_error_count == ["some_header"] + +# stream = ( +# events.StartElement(tag="LALevelVacancies"), +# events.TextNode(text="some_header", blank_error="1"), +# events.TextNode(text="some_header_2", blank_error="1"), +# events.TextNode(text="some_header_3", blank_error=""), +# events.TextNode(text="some_header_4"), +# events.EndElement(tag="Message"), +# logger.ErrorTable(), +# ) +# events_with_blank_error_list = list(logger.create_blank_error_list(stream)) 
+# for event in events_with_blank_error_list: +# if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: +# print(event.blank_error_count) +# assert event.blank_error_count == [ +# "some_header", +# "some_header_2", +# ] @patch("builtins.open", create=True) @@ -136,9 +139,9 @@ def test_save_errors_la(mock_save): ) # mock_save.write.assert_called_once_with(f"test_file_{start_time}") -# test_create_formatting_error_list() -# test_blank_error_check() -# test_create_blank_error_list() +test_create_formatting_error_list() +test_blank_error_check() +test_create_blank_error_list() # test_save_errors_la() From 635b472045213ec85340235957ddb67dcd24a610 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:13:15 +0000 Subject: [PATCH 29/40] Update logger, create tests, run black --- .../csww_main_functions.py | 12 +-- .../lds_csww_clean/filters.py | 23 +++++- .../lds_csww_clean/logger.py | 70 ---------------- tests/social_work_workforce/test_logger.py | 82 ++++--------------- 4 files changed, 45 insertions(+), 142 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 1c65b7cc..62ea8bb1 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -218,12 +218,12 @@ def pan_agg(input, la_code, output): # Run in Visual Studio Code |> -cleanfile( - "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml", - "BAD", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", - "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -) +# cleanfile( +# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022_sc.xml", +# "BAD", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", +# ) # la_agg( # "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py index eafb9297..3073a5c8 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py @@ -76,7 +76,10 @@ def _create_category_dict(field: str, file: str): documentation = element.findall(search_doc) for i, d in enumerate(documentation): name_dict = {"name": d.text} - category_dict["category"][i] = {**category_dict["category"][i], **name_dict} + category_dict["category"][i] = { + **category_dict["category"][i], + **name_dict, + } return category_dict @@ -94,7 +97,9 @@ def _create_float_dict(field: str, file: str): search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" restriction = element.findall(search_restriction) for r in restriction: - code_dict = {"numeric": r.get("base")[3:]} # Remove the "xs:" from the start of the base string + code_dict = { + "numeric": r.get("base")[3:] + } # Remove the "xs:" from the start of the base string if code_dict["numeric"] == "decimal": float_dict = code_dict @@ -120,6 +125,13 @@ def _create_float_dict(field: str, file: str): def _create_regex_dict(field: str, file: 
str): + """ + Parse an XML file and extract the regex pattern for a given field name + + :param field: The name of the field to look for in the XML file + :param file: The path to the XML file + :return: A dictionary with the key "regex_string" and the value as the regex pattern, or None if no pattern is found + """ regex_dict = None xsd_xml = ET.parse(file) @@ -163,6 +175,13 @@ def add_schema(event, schema: xmlschema.XMLSchema): @streamfilter(check=type_check(events.TextNode), fail_function=pass_event) def add_schema_dict(event, schema_path: str): + """ + Add a dictionary of schema attributes to an event object based on its type and occurrence + + :param event: An event object with a schema attribute + :param schema_path: The path to the schema file + :return: A new event object with a schema_dict attribute, or the original event object if no schema_dict is found + """ schema_dict = None config_type = event.schema.type.name diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py index 0765bfcd..8c7e715a 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py @@ -94,57 +94,6 @@ def create_blank_error_list(stream): yield event -@streamfilter( - check=type_check(events.StartTable), - fail_function=pass_event, - error_function=pass_event, -) -def create_file_match_error(event): - """ - Add a match_error to StartTables that do not have an event.sheet_name so these errors can be written to the log.txt - file. If there is no event.sheet_name for a given StartTable that means its headers did not match any of those - in the config file - - :param event: A filtered list of event objects of type StartTable - :return: An updated list of event objects - """ - try: - if event.table_name: - return event - except AttributeError: - return event.from_event( - event, - match_error=f"Failed to find a set of matching columns headers for file titled " - f"'{event.filename}' which contains column headers {event.headers} so no output has been produced", - ) - return event - - -@streamfilter( - check=type_check(events.StartTable), - fail_function=pass_event, - error_function=pass_event, -) -def create_extra_column_error(event): - """ - Add a extra_column_error to StartTables that have more columns than the set of expected columns so these can be written to the log.txt - - :param event: A filtered list of event objects of type StartTable - :return: An updated list of event objects - """ - extra_columns = [ - item for item in event.headers if item not in event.expected_columns - ] - if len(extra_columns) == 0: - return event - else: - return event.from_event( - event, - extra_column_error=f"Additional columns were found in file titled " - f"'{event.filename}' than those expected from schema for filetype = {event.table_name}, so these columns have been removed: {extra_columns}", - ) - - def save_errors_la(stream, la_log_dir, filename): """ Count the error events and save them as a text file in the Local Authority Logs directory @@ -192,23 +141,6 @@ def save_errors_la(stream, la_log_dir, filename): except AttributeError: pass - # if isinstance(event, events.StartTable): - # match_error = getattr(event, "match_error", None) - # if match_error: - # with open( - # f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", - # "a", - # ) as f: - # f.write(match_error) - # f.write("\n") - # column_error = getattr(event, 
"extra_column_error", None) - # if column_error: - # with open( - # f"{os.path.join(la_log_dir, event.filename)}_error_log_{start_time}.txt", - # "a", - # ) as f: - # f.write(column_error) - # f.write("\n") yield event @@ -222,6 +154,4 @@ def log_errors(stream): stream = blank_error_check(stream) stream = create_formatting_error_list(stream) stream = create_blank_error_list(stream) - # stream = create_file_match_error(stream) - # stream = create_extra_column_error(stream) return stream diff --git a/tests/social_work_workforce/test_logger.py b/tests/social_work_workforce/test_logger.py index aa291052..ad00a80b 100644 --- a/tests/social_work_workforce/test_logger.py +++ b/tests/social_work_workforce/test_logger.py @@ -1,5 +1,4 @@ import tempfile as tmp -import xmlschema from unittest.mock import patch from pathlib import Path from datetime import datetime @@ -61,7 +60,7 @@ def test_create_formatting_error_list(): def test_blank_error_check(): - stream = logger.blank_error_check( + mock_stream = logger.blank_error_check( [ events.TextNode(schema_dict={"canbeblank": False}, text="", formatting_error="0"), events.TextNode(schema_dict={"canbeblank": False}, text=None, formatting_error="0"), @@ -70,7 +69,7 @@ def test_blank_error_check(): events.TextNode(schema_dict={"canbeblank": True}, text="", formatting_error="0"), ] ) - stream = list(stream) + stream = list(mock_stream) assert stream[0].blank_error == "1" assert stream[1].blank_error == "1" assert "blank_error" not in stream[2].as_dict() @@ -78,70 +77,25 @@ def test_blank_error_check(): assert "blank_error" not in stream[4].as_dict() -# def test_create_blank_error_list(): -# schema = xmlschema.XsdElement('some_header',"xxx",None,True) -# stream = ( -# events.StartElement(tag="LALevelVacancies"), -# events.TextNode(text="text_1", schema=schema, blank_error="1"), -# # events.TextNode(text="text_2", schema={'name': 'some_header_2'}, blank_error=None), -# # events.TextNode(text="text_3", schema={'name': 'some_header_3'}, blank_error=""), -# # events.TextNode(text="text_4", schema="header {'name': 'some_header_4'}')"), -# events.EndElement(tag="Message"), -# logger.ErrorTable(), -# ) -# events_with_blank_error_count = list(logger.create_blank_error_list(stream)) -# print(f"blank error headers = {events_with_blank_error_count}") -# for event in events_with_blank_error_count: -# print(event.schema.name) -# if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: -# assert event.blank_error_count == ["some_header"] - -# stream = ( -# events.StartElement(tag="LALevelVacancies"), -# events.TextNode(text="some_header", blank_error="1"), -# events.TextNode(text="some_header_2", blank_error="1"), -# events.TextNode(text="some_header_3", blank_error=""), -# events.TextNode(text="some_header_4"), -# events.EndElement(tag="Message"), -# logger.ErrorTable(), -# ) -# events_with_blank_error_list = list(logger.create_blank_error_list(stream)) -# for event in events_with_blank_error_list: -# if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: -# print(event.blank_error_count) -# assert event.blank_error_count == [ -# "some_header", -# "some_header_2", -# ] - - -@patch("builtins.open", create=True) -def test_save_errors_la(mock_save): - la_log_dir = tmp.gettempdir() - start_time = f"{datetime.now():%Y-%m-%dT%H%M%SZ}" - - stream = logger.save_errors_la( - [ - logger.ErrorTable( - filename="test_file", - formatting_error_count=["CHILD", "CHILD", "AGE"], - blank_error_count=["POSTCODE", "POSTCODE", "DATE"], - table_name="List 1", - 
extra_column_error=["list", "of", "headers"], - ), - ], - la_log_dir, +def test_create_blank_error_list(): + mock_stream = ( + events.StartElement(tag="LALevelVacancies"), + events.TextNode(text="some_header", blank_error="1"), + events.TextNode(text="some_header_2", blank_error="1"), + events.TextNode(text="some_header_3", blank_error=""), + events.TextNode(text="some_header_4"), + events.EndElement(tag="Message"), ) - stream = list(stream) + events_with_blank_error_list = list(logger.create_blank_error_list(mock_stream)) + for event in events_with_blank_error_list: + if isinstance(event, logger.ErrorTable) and event.as_dict() != {}: + print(event.blank_error_list) + assert event.blank_error_list == [ + "some_header", + "some_header_2", + ] - mock_save.assert_called_once_with( - f"{Path(la_log_dir, 'test_file')}_error_log_{start_time}.txt", "a" - ) - # mock_save.write.assert_called_once_with(f"test_file_{start_time}") test_create_formatting_error_list() test_blank_error_check() test_create_blank_error_list() -# test_save_errors_la() - - From e627801b6c3428646e118122bb73d31aa2b4f2eb Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:44:21 +0000 Subject: [PATCH 30/40] Add docstrings in SWFtools modules --- .../SWFtools/analysis/FTESum.py | 14 +++++++++++++ .../SWFtools/analysis/growth_tables.py | 4 ++++ .../SWFtools/analysis/seniority.py | 21 +++++++++++++++++++ .../lds_csww_clean/csww_record.py | 6 ++++++ 4 files changed, 45 insertions(+) diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py index 39c69088..3275bb2e 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py @@ -4,6 +4,13 @@ def FTESum(): + """ + Calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and SeniorityName from + the input csv file + + :return: Excel file with the name FTESum_5d.xlsx and the same path as the input file + """ + # ===== Read file ===== # file = "CompMergSen.csv" requestPath = work_path.request @@ -26,6 +33,13 @@ def FTESum(): def FTESum_2020(): + """ + Read a CSV file and calculate the sum of FTE by LEAName, YearCensus, SeniorityCode and + SeniorityName for the year 2020 + + :return: Excel file with the name FTESum_2020.xlsx and the same path as the input file + """ + # ===== Read file ===== # file = "CompMergSen.csv" requestPath = work_path.request diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py index eb7dd8d6..4ad4bab2 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py @@ -8,6 +8,10 @@ def growth_tables(): + """ + Create two Excel files with tables of growth rates and population growth for six LEAs + """ + growth_rate_df = { "LEAName": [ "Havering", diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py index b2d91a5a..0d94afe6 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py @@ -9,6 +9,12 @@ def seniority(): + """ + Assign a seniority code to each row in the input CSV file + 
based on the role start date, agency worker status and org role. + + :return: CSV file with column SeniorityCode added. + """ # ===== Read file ===== # file = "merged_modified.csv" path = work_path.flatfile_folder @@ -109,6 +115,15 @@ def seniority(): def seniority_forecast_04(): + """ + Calculate the seniority forecast for six LEAs from 2020 to 2025. + + Reads two Excel files. The seniority forecast is calculated by multiplying the FTESum from the first file + by the population growth rate for each year and LEA from the second file + + :return: Excel file with the name seniority_forecast_04_clean.xlsx + """ + # ===== Read file ===== # file = "FTESum_2020.xlsx" requestPath = work_path.request @@ -236,6 +251,12 @@ def seniority_forecast_5c(): def progressed(): + """ + Determine whether an employee has progressed in their seniority code from the previous year + + :return: The input csv file with column called Progress added + """ + # ===== Read file ===== # file = "Seniority.csv" requestPath = work_path.request diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 7eea634b..85d9acbb 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -42,6 +42,12 @@ def text_collector(stream): @xml_collector def message_collector(stream): + """ + Collect messages from XML elements and yield events + + :param stream: An iterator of events from an XML parser + :yield: Events of type HeaderEvent, CSWWEvent or LALevelEvent + """ stream = peekable(stream) assert stream.peek().tag == "Message", "Expected Message, got {}".format( stream.peek().tag From 8ca8f2b99f7c0144c83151e7a41984ab9592d56b Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:55:43 +0000 Subject: [PATCH 31/40] Amend docstring for seniority.py --- .../SWFtools/analysis/seniority.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py index 0d94afe6..cd5d787e 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py @@ -10,10 +10,12 @@ def seniority(): """ - Assign a seniority code to each row in the input CSV file - based on the role start date, agency worker status and org role. - - :return: CSV file with column SeniorityCode added. + Assign a seniority code to each worker in the input CSV file based on + the role start date, agency worker status and org role. + + Also add two columns indicating whether they are new and whether they left in the census year. + + :return: CSV file with columns SeniorityCode, NewOrNot and LeftOrNot added. 
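# The docstring above compresses a decision table. A toy illustration of the
# shape of that logic only - the real mapping lives in SENIORITY_CODE_DICT and
# the body of seniority(), and every threshold and code below is a made-up
# example, not the production rule set:
def toy_seniority_code(years_in_role: float, is_agency_worker: bool) -> int:
    if is_agency_worker:
        return 5  # agency staff coded separately
    if years_in_role < 2:
        return 1  # early career
    return 2  # established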
""" # ===== Read file ===== # file = "merged_modified.csv" @@ -256,7 +258,7 @@ def progressed(): :return: The input csv file with column called Progress added """ - + # ===== Read file ===== # file = "Seniority.csv" requestPath = work_path.request From 9d64595a7a3d998074076677807bdacce4fddf7e Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 2 Aug 2023 08:33:21 +0000 Subject: [PATCH 32/40] Fix test_cleaner for incorrectly named variable --- .../SWFtools/util/work_path.py | 1 - tests/social_work_workforce/test_cleaner.py | 68 +++++++++---------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/SWFtools/util/work_path.py b/liiatools/datasets/social_work_workforce/SWFtools/util/work_path.py index c33db52c..ffa82e95 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/util/work_path.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/util/work_path.py @@ -9,7 +9,6 @@ # CSWW files must be in one "LA" folder per LA, in the cssw_folder csww_folder = os.path.join(main_folder, "samples/csww") -# print (f"csww_folder is {csww_folder}") # Flat files flatfile_folder = os.path.join(main_folder, "samples/flatfiles") diff --git a/tests/social_work_workforce/test_cleaner.py b/tests/social_work_workforce/test_cleaner.py index 9da0d6ea..e334c9b8 100644 --- a/tests/social_work_workforce/test_cleaner.py +++ b/tests/social_work_workforce/test_cleaner.py @@ -7,12 +7,12 @@ def test_clean_dates(): event = events.TextNode(text=datetime(2019, 1, 15), schema_dict={"date": "%d/%m/%Y"}) cleaned_event = list(cleaner.clean_dates(event))[0] assert cleaned_event.text == datetime(2019, 1, 15).date() - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="2019/1/15", schema_dict={"date": "%d/%m/%Y"}) cleaned_event = list(cleaner.clean_dates(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode( text=datetime(2019, 1, 15), schema_dict={"not_date": "%d/%m/%Y"} @@ -27,12 +27,12 @@ def test_clean_dates(): event = events.TextNode(text=None, schema_dict={"date": "%d/%m/%Y"}) cleaned_event = list(cleaner.clean_dates(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="", schema_dict={"date": "%d/%m/%Y"}) cleaned_event = list(cleaner.clean_dates(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" def test_clean_categories(): @@ -44,7 +44,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "0" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode( text="0.0", @@ -54,7 +54,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "0" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode( text=0, @@ -64,7 +64,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "0" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode( text="true", @@ -74,7 +74,7 @@ def test_clean_categories(): ) cleaned_event = 
list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "1" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode( text=123, @@ -84,7 +84,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode( text="string", @@ -94,7 +94,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode( text="string", @@ -116,7 +116,7 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode( text="", @@ -126,39 +126,39 @@ def test_clean_categories(): ) cleaned_event = list(cleaner.clean_categories(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" def test_clean_numeric_integer(): event = events.TextNode(text=123, schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="", schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=None, schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="123", schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="string", schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode( text=datetime(2017, 3, 17), schema_dict={"not_numeric": "integer"} @@ -171,47 +171,47 @@ def test_clean_numeric_decimal(): event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.45 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=123.4567, schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.46 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 0}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 - assert 
cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=123.456, schema_dict={"numeric": "decimal", "decimal": 6}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.456 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="", schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=None, schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="123.4567", schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.46 - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="string", schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode( text=datetime(2017, 3, 17), schema_dict={"not_numeric": "decimal"} @@ -223,37 +223,37 @@ def test_clean_regex_string(): event = events.TextNode(text="AB1234567890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "AB1234567890" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=None, schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text=" AB1234567890 ", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "AB1234567890" - assert cleaned_event.error == "0" + assert cleaned_event.formatting_error == "0" event = events.TextNode(text="AB123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode(text="AB1234567890123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode(text="AB12345 67890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = 
list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" - assert cleaned_event.error == "1" + assert cleaned_event.formatting_error == "1" event = events.TextNode(text="string", schema_dict={"not_regex_string": r"[A-Za-z]{2}\d{10}"}) cleaned_event = list(cleaner.clean_regex_string(event))[0] From df050ccd1c5437ae61fe3340879879403cd32600 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:19:37 +0000 Subject: [PATCH 33/40] Fix SWFTools output errors --- .../SWFtools/analysis/FTESum.py | 24 ++-- .../SWFtools/analysis/growth_tables.py | 5 +- .../SWFtools/analysis/seniority.py | 119 ++++++++++-------- .../csww/NEW/social_work_workforce_2022.xml | 2 +- 4 files changed, 82 insertions(+), 68 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py index 3275bb2e..6bc8e1b8 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py @@ -1,6 +1,7 @@ import os import pandas as pd import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path +import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs def FTESum(): @@ -47,15 +48,18 @@ def FTESum_2020(): df = pd.read_csv(pathFile) df2020 = df[df["YearCensus"] == 2020] + + if df2020.empty: + AppLogs.log("FTESum_2020 error: No data for year 2020", console_output=True) + else: + df5D = df2020[["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]] - df5D = df2020[["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]] - - df5D = df2020.groupby( - ["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"] - ).agg(FTESum=("FTE", "sum")) + df5D = df2020.groupby( + ["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"] + ).agg(FTESum=("FTE", "sum")) - # ===== Save and export file ===== # - fileOutN = "FTESum_2020.xlsx" - requestPath = work_path.request - fileOut = os.path.join(requestPath, fileOutN) - df5D.to_excel(fileOut, merge_cells=False) + # ===== Save and export file ===== # + fileOutN = "FTESum_2020.xlsx" + requestPath = work_path.request + fileOut = os.path.join(requestPath, fileOutN) + df5D.to_excel(fileOut, merge_cells=False) diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py index 4ad4bab2..2147de37 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py @@ -5,6 +5,7 @@ import os import pandas as pd import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path +import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs def growth_tables(): @@ -38,7 +39,7 @@ def growth_tables(): fileOut = os.path.join(requestPath, fileOutN) growth_rate_table.to_excel(fileOut, index=False) - print("Auxiliary table: ", fileOutN, " Created") + AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True) """ Population growth table: 2020 to 2026 @@ -70,4 +71,4 @@ def growth_tables(): fileOut = os.path.join(requestPath, fileOutN) population_growth_table.to_excel(fileOut, index=False) - print("Auxiliary table: ", fileOutN, " Created") + AppLogs.log(f"Auxiliary table: {fileOutN} created", console_output=True) diff --git 
a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py index cd5d787e..df4b2677 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py @@ -6,6 +6,7 @@ SENIORITY_CODE_DICT, ) import liiatools.datasets.social_work_workforce.SWFtools.util.work_path as work_path +import liiatools.datasets.social_work_workforce.SWFtools.util.AppLogs as AppLogs def seniority(): @@ -130,61 +131,69 @@ def seniority_forecast_04(): file = "FTESum_2020.xlsx" requestPath = work_path.request pathFile = os.path.join(requestPath, file) - dfSen = pd.read_excel(pathFile) - - # ===== Rename column ===== # - dfSen.rename(columns={"FTESum": "2020"}, inplace=True) - - # ===== Read file ===== # - file = "population_growth_table.xlsx" - requestPath = work_path.request - pathFile = os.path.join(requestPath, file) - p_df = pd.read_excel(pathFile) - - countYearBefore = 2019 - countYearNext = 2020 - for count in range(5): - countYearBefore = countYearBefore + 1 - countYearNext = countYearNext + 1 - # Havering - dfSen.loc[dfSen["LEAName"] == "Havering", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[0, str(countYearBefore)] - ) * p_df.loc[0, str(countYearNext)] - # Barking and Dagenham - dfSen.loc[dfSen["LEAName"] == "Barking and Dagenham", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[1, str(countYearBefore)] - ) * p_df.loc[1, str(countYearNext)] - # Redbridge - dfSen.loc[dfSen["LEAName"] == "Redbridge", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[2, str(countYearBefore)] - ) * p_df.loc[2, str(countYearNext)] - # Newham - dfSen.loc[dfSen["LEAName"] == "Newham", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[3, str(countYearBefore)] - ) * p_df.loc[3, str(countYearNext)] - # Tower Hamlets - dfSen.loc[dfSen["LEAName"] == "Tower Hamlets", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[4, str(countYearBefore)] - ) * p_df.loc[4, str(countYearNext)] - # Waltham Forest - dfSen.loc[dfSen["LEAName"] == "Waltham Forest", str(countYearNext)] = ( - dfSen[str(countYearBefore)] / p_df.loc[5, str(countYearBefore)] - ) * p_df.loc[5, str(countYearNext)] - - dfSen["2020"] = dfSen["2020"].round(3) - dfSen["2021"] = dfSen["2021"].round(3) - dfSen["2022"] = dfSen["2022"].round(3) - dfSen["2023"] = dfSen["2023"].round(3) - dfSen["2024"] = dfSen["2024"].round(3) - dfSen["2025"] = dfSen["2025"].round(3) - - dfSen = dfSen.drop(["YearCensus"], axis=1) - - # ===== Save and export file ===== # - fileOutN = "seniority_forecast_04_clean.xlsx" - requestPath = work_path.request - fileOut = os.path.join(requestPath, fileOutN) - dfSen.to_excel(fileOut, index=False, merge_cells=False) + try: + dfSen = pd.read_excel(pathFile) + except FileNotFoundError as e: + AppLogs.log(f"FileNotFoundError: {e.filename}", console_output=True) + return + + + if dfSen.empty: + AppLogs.log("seniority_forecast_04 error: No data in FTESum_2020.xlsx", console_output=True) + else: + # ===== Rename column ===== # + dfSen.rename(columns={"FTESum": "2020"}, inplace=True) + + # ===== Read file ===== # + file = "population_growth_table.xlsx" + requestPath = work_path.request + pathFile = os.path.join(requestPath, file) + p_df = pd.read_excel(pathFile) + + countYearBefore = 2019 + countYearNext = 2020 + for count in range(5): + countYearBefore = countYearBefore + 1 + countYearNext = countYearNext + 1 + # 
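# The per-LEA loop below applies one forecasting step per year:
# FTE[y + 1] = FTE[y] / population[y] * population[y + 1], i.e. last year's
# FTE scaled by the LEA's population growth ratio. Worked example with
# invented numbers:
fte_2020 = 120.0
population = {"2020": 260_000, "2021": 263_900}
fte_2021 = fte_2020 / population["2020"] * population["2021"]
assert round(fte_2021, 1) == 121.8  # a 1.5% population rise lifts FTE by 1.5%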
Havering + dfSen.loc[dfSen["LEAName"] == "Havering", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[0, str(countYearBefore)] + ) * p_df.loc[0, str(countYearNext)] + # Barking and Dagenham + dfSen.loc[dfSen["LEAName"] == "Barking and Dagenham", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[1, str(countYearBefore)] + ) * p_df.loc[1, str(countYearNext)] + # Redbridge + dfSen.loc[dfSen["LEAName"] == "Redbridge", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[2, str(countYearBefore)] + ) * p_df.loc[2, str(countYearNext)] + # Newham + dfSen.loc[dfSen["LEAName"] == "Newham", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[3, str(countYearBefore)] + ) * p_df.loc[3, str(countYearNext)] + # Tower Hamlets + dfSen.loc[dfSen["LEAName"] == "Tower Hamlets", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[4, str(countYearBefore)] + ) * p_df.loc[4, str(countYearNext)] + # Waltham Forest + dfSen.loc[dfSen["LEAName"] == "Waltham Forest", str(countYearNext)] = ( + dfSen[str(countYearBefore)] / p_df.loc[5, str(countYearBefore)] + ) * p_df.loc[5, str(countYearNext)] + + dfSen["2020"] = dfSen["2020"].round(3) + dfSen["2021"] = dfSen["2021"].round(3) + dfSen["2022"] = dfSen["2022"].round(3) + dfSen["2023"] = dfSen["2023"].round(3) + dfSen["2024"] = dfSen["2024"].round(3) + dfSen["2025"] = dfSen["2025"].round(3) + + dfSen = dfSen.drop(["YearCensus"], axis=1) + + # ===== Save and export file ===== # + fileOutN = "seniority_forecast_04_clean.xlsx" + requestPath = work_path.request + fileOut = os.path.join(requestPath, fileOutN) + dfSen.to_excel(fileOut, index=False, merge_cells=False) def seniority_forecast_5c(): diff --git a/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml b/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml index 3cb567fa..8c3fc4a5 100644 --- a/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml +++ b/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml @@ -7,7 +7,7 @@ L - 314 + 316 liiatools.datasets.social_work_workforce.sample_data 2023-03-28T14:54:55Z From 0014c13cc09cc3fbca5b9d1cb0d2f37272c99c2c Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Thu, 3 Aug 2023 08:48:23 +0000 Subject: [PATCH 34/40] Add unit tests for csww_record --- .../lds_csww_clean/csww_record.py | 10 ++- .../social_work_workforce/test_csww_record.py | 80 +++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 tests/social_work_workforce/test_csww_record.py diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 85d9acbb..85d058ff 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -30,6 +30,12 @@ def _reduce_dict(dict_instance): @xml_collector def text_collector(stream): + """ + Create a dictionary of text values for each element + + :param stream: An iterator of events from an XML parser + :return: Dictionary containing element name and text values + """ data_dict = {} current_element = None for event in stream: @@ -49,9 +55,7 @@ def message_collector(stream): :yield: Events of type HeaderEvent, CSWWEvent or LALevelEvent """ stream = peekable(stream) - assert stream.peek().tag == "Message", "Expected 
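# For orientation: message_collector walks a document of this shape (the same
# structure the new unit tests assemble event-by-event; the element values are
# the test fixtures, not real data):
SAMPLE_MESSAGE = """
<Message>
  <Header><Version>1</Version></Header>
  <LALevelVacancies><NumberOfVacancies>100</NumberOfVacancies></LALevelVacancies>
  <CSWWWorker><ID>100</ID><SWENo>AB123456789</SWENo><Agency>0</Agency></CSWWWorker>
</Message>
"""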
Message, got {}".format( - stream.peek().tag - ) + assert stream.peek().tag == "Message", f"Expected Message, got {stream.peek().tag}" while stream: event = stream.peek() if event.get("tag") == "Header": diff --git a/tests/social_work_workforce/test_csww_record.py b/tests/social_work_workforce/test_csww_record.py new file mode 100644 index 00000000..88cd3890 --- /dev/null +++ b/tests/social_work_workforce/test_csww_record.py @@ -0,0 +1,80 @@ +# Import the unittest module and the code to be tested +import unittest +from sfdata_stream_parser.events import StartElement, EndElement, TextNode +from liiatools.datasets.social_work_workforce.lds_csww_clean.csww_record import ( + text_collector, + message_collector, + CSWWEvent, + LALevelEvent, + HeaderEvent, +) +from liiatools.datasets.social_work_workforce.lds_csww_clean.xml import dom_parse + + +class TestRecord(unittest.TestCase): + def generate_text_element(self, tag: str, text): + """ + Create a complete TextNode sandwiched between a StartElement and EndElement + + :param tag: XML tag + :param text: text to be stored in the given XML tag, could be a string, integer, float etc. + :return: StartElement and EndElement with given tags and TextNode with given text + """ + yield StartElement(tag=tag) + yield TextNode(text=str(text)) + yield EndElement(tag=tag) + + def generate_test_csww_file(self): + """ + Generate a sample children's social work workforce census file + + :return: stream of generators containing information required to create an XML file + """ + yield StartElement(tag="Message") + yield StartElement(tag="Header") + yield from self.generate_text_element(tag="Version", text=1) + yield EndElement(tag="Header") + yield StartElement(tag="LALevelVacancies") + yield from self.generate_text_element(tag="NumberOfVacancies", text=100) + yield EndElement(tag="LALevelVacancies") + yield StartElement(tag="CSWWWorker") + yield from self.generate_text_element(tag="ID", text=100) + yield from self.generate_text_element(tag="SWENo", text="AB123456789") + yield from self.generate_text_element(tag="Agency", text=0) + yield EndElement(tag="CSWWWorker") + yield EndElement(tag="Message") + + def test_text_collector(self): + # test that the text_collector returns a dictionary of events and their text values from the stream + test_stream = self.generate_test_csww_file() + test_record = text_collector(test_stream) + self.assertEqual(len(test_record), 5) + self.assertEqual( + test_record, + { + "Version": "1", + "NumberOfVacancies": "100", + "ID": "100", + "SWENo": "AB123456789", + "Agency": "0", + }, + ) + + def test_message_collector(self): + # test that the message_collector yields events of the correct type from the stream + test_stream = self.generate_test_csww_file() + test_events = list(message_collector(test_stream)) + self.assertEqual(len(test_events), 3) + self.assertIsInstance(test_events[0], HeaderEvent) + self.assertEqual(test_events[0].record, {"Version": "1"}) + self.assertIsInstance(test_events[1], LALevelEvent) + self.assertEqual(test_events[1].record, {"NumberOfVacancies": "100"}) + self.assertIsInstance(test_events[2], CSWWEvent) + self.assertEqual( + test_events[2].record, {"ID": "100", "SWENo": "AB123456789", "Agency": "0"} + ) + + +# Run the tests +if __name__ == "__main__": + unittest.main() From e63c1051a3c6447e7bbb44a5a3432997133bea89 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 25 Aug 2023 14:32:43 +0000 Subject: [PATCH 35/40] Rewrite to_numeric in converters and cleaner 
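
Folding to_integer and to_decimal into a single to_numeric gives the cleaner
one call site for both numeric shapes. Expected behaviour, as exercised by the
tests in this patch (usage sketch only, not part of the diff):

    from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import to_numeric

    to_numeric("12.3456", "decimal", decimal_places=3)  # -> 12.346
    to_numeric("1.0", "integer")                        # -> 1
    to_numeric("string", "decimal", decimal_places=2)   # -> "error"
    to_numeric("", "integer")                           # -> ""
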
--- .../lds_csww_clean/cleaner.py | 11 ++-- .../lds_csww_clean/converters.py | 62 +++++++------------ tests/social_work_workforce/test_cleaner.py | 7 +-- .../social_work_workforce/test_converters.py | 40 ++++++------ 4 files changed, 50 insertions(+), 70 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index fb4e857f..5e1e51b2 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -6,8 +6,7 @@ from liiatools.datasets.social_work_workforce.lds_csww_clean.converters import ( to_category, - to_integer, - to_decimal, + to_numeric, to_regex, ) @@ -73,10 +72,14 @@ def clean_numeric(event): numeric = event.schema_dict["numeric"] try: if numeric == "integer": - clean_text = to_integer(event.text, numeric) + clean_text = to_numeric(value=event.text, config=numeric) elif numeric == "decimal": decimal_places = int(event.schema_dict["decimal"]) - clean_text = to_decimal(event.text, numeric, decimal_places) + # min_inclusive = event.schema_dict["min_inclusive"] + # print(f"min_inclusive = {min_inclusive}") + # max_inclusive = event.schema_dict["max_inclusive"] + # print(f"max_inclusive = {max_inclusive}") + clean_text = to_numeric(value=event.text, config=numeric, decimal_places=decimal_places) # min_inclusive=min_inclusive, max_inclusive=max_inclusive if clean_text != "error": return event.from_event(event, text=clean_text, formatting_error="0") return event.from_event(event, text="", formatting_error="1") diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py index f3ba9ddc..f2009770 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -31,14 +31,26 @@ def to_category(string, categories): return "error" -def to_integer(value, config): +def to_numeric(value, config, decimal_places=0): # min_inclusive=None, max_inclusive=None """ - Convert any strings that should be integers based on the config into integers + Convert any strings that should be integer or decimal based on the config into integer or decimal - :param value: Some value to convert to an integer + :param value: Some value to convert to an integer or decimal :param config: The loaded configuration - :return: Either an integer value or an "error" string if value could not be formatted as integer or a blank string if no value provided + :param dec_places: The number of decimal places to apply (default 0) + :param min_inclusive: Minimum value allowed (default none) + :param max_inclusive: Maximum value allowed (default none) + :return: Either an integer, a decimal value formatted to number of decimal places or an "error" string if value could not be formatted as decimal or a blank string if no value provided """ + if config == "decimal": + if value or value == 0: + try: + float(value) + round_to_dp = round(float(value), decimal_places) + return round_to_dp + except (ValueError, TypeError): + return "error" # value incorrectly formatted + return "" # no value provided if config == "integer": if value or value==0: if isinstance(value, str) and value[-2:] == ".0": @@ -52,27 +64,6 @@ def to_integer(value, config): return value -def to_decimal(value, config, decimal_places=0): - """ - Convert any strings that should be decimal based 
on the config into decimals - - :param value: Some value to convert to a decimal - :param config: The loaded configuration - :param dec_places: The number of decimal places to apply (default 0) - :return: Either a decimal value formatted to number of decimal places or an "error" string if value could not be formatted as decimal or a blank string if no value provided - """ - if config == "decimal": - if value or value == 0: - try: - float(value) - round_to_dp = round(float(value), decimal_places) - return round_to_dp - except (ValueError, TypeError): - return "error" # value incorrectly formatted - return "" # no value provided - return value - - def to_regex(value, pattern): """ Convert any strings that should conform to regex pattern based on the schema into regex string @@ -81,18 +72,11 @@ def to_regex(value, pattern): :param pattern: The regex pattern to compare :return: Either a string matching the regex pattern or an "error" string if value does not match pattern or a blank string if no value provided """ - if pattern: - if value: - stripped_value = value.strip() - try: - isfullmatch = re.fullmatch(pattern, stripped_value) - if isfullmatch: - return stripped_value - else: - return "error" # value does not match regex pattern - except (ValueError, TypeError): - return "error" # value incorrectly formatted - else: - return "" # no value provided + if value: + stripped_value = value.strip() + isfullmatch = re.fullmatch(pattern, stripped_value) + if isfullmatch: + return stripped_value + return "error" # value does not match regex pattern else: - return value \ No newline at end of file + return "" # no value provided \ No newline at end of file diff --git a/tests/social_work_workforce/test_cleaner.py b/tests/social_work_workforce/test_cleaner.py index e334c9b8..6f4992c1 100644 --- a/tests/social_work_workforce/test_cleaner.py +++ b/tests/social_work_workforce/test_cleaner.py @@ -129,7 +129,7 @@ def test_clean_categories(): assert cleaned_event.formatting_error == "0" -def test_clean_numeric_integer(): +def test_clean_numeric(): event = events.TextNode(text=123, schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 @@ -166,8 +166,6 @@ def test_clean_numeric_integer(): cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == datetime(2017, 3, 17) - -def test_clean_numeric_decimal(): event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.45 @@ -262,7 +260,6 @@ def test_clean_regex_string(): # test_clean_dates() # test_clean_categories() -# test_clean_numeric_integer() -# test_clean_numeric_decimal() +# test_clean_numeric() # test_clean_regex_string() diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py index 88b8ff1d..36aed7ce 100644 --- a/tests/social_work_workforce/test_converters.py +++ b/tests/social_work_workforce/test_converters.py @@ -26,28 +26,25 @@ def test_to_category(): assert converters.to_category(None, category_dict) == "" -def test_to_integer(): - assert converters.to_integer("3000", "integer") == 3000 - assert converters.to_integer(123, "integer") == 123 - assert converters.to_integer("1.0", "integer") == 1 - assert converters.to_integer("date", "") == "date" - assert converters.to_integer(123.456, "integer") == 123 - assert converters.to_integer(0, "integer") == 0 - assert converters.to_integer("", "integer") 
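# The rewritten to_regex above leans on re.fullmatch, which only succeeds when
# the whole (stripped) string fits the pattern - unlike re.match, which would
# happily accept a matching prefix. Quick standalone check:
import re

pattern = r"[A-Za-z]{2}\d{10}"
assert re.fullmatch(pattern, "AB1234567890")                # whole string matches
assert re.fullmatch(pattern, "AB1234567890123456") is None  # extra digits reject it
assert re.match(pattern, "AB1234567890123456")              # match alone would pass a prefix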
== "" - assert converters.to_integer(None, "integer") == "" - - -def test_to_decimal(): +def test_to_numeric(): decimal_places = 3 - assert converters.to_decimal("12.345", "decimal", decimal_places) == 12.345 - assert converters.to_decimal("12.3456", "decimal", decimal_places) == 12.346 - assert converters.to_decimal("12.3", "decimal", decimal_places) == 12.3 - assert converters.to_decimal(12.3456, "decimal", decimal_places) == 12.346 - assert converters.to_decimal("1.0", "decimal", decimal_places) == 1 - assert converters.to_decimal(0, "decimal", decimal_places) == 0 - assert converters.to_decimal("date", "") == "date" - assert converters.to_decimal("", "decimal", decimal_places) == "" - assert converters.to_decimal(None, "decimal", decimal_places) == "" + assert converters.to_numeric("12.345", "decimal", decimal_places) == 12.345 + assert converters.to_numeric("12.3456", "decimal", decimal_places) == 12.346 + assert converters.to_numeric("12.3", "decimal", decimal_places) == 12.3 + assert converters.to_numeric(12.3456, "decimal", decimal_places) == 12.346 + assert converters.to_numeric("1.0", "decimal", decimal_places) == 1 + assert converters.to_numeric(0, "decimal", decimal_places) == 0 + assert converters.to_numeric("date", "") == "date" + assert converters.to_numeric("", "decimal", decimal_places) == "" + assert converters.to_numeric(None, "decimal", decimal_places) == "" + assert converters.to_numeric("3000", "integer") == 3000 + assert converters.to_numeric(123, "integer") == 123 + assert converters.to_numeric("1.0", "integer") == 1 + assert converters.to_numeric("date", "") == "date" + assert converters.to_numeric(123.456, "integer") == 123 + assert converters.to_numeric(0, "integer") == 0 + assert converters.to_numeric("", "integer") == "" + assert converters.to_numeric(None, "integer") == "" def test_to_regex(): @@ -57,7 +54,6 @@ def test_to_regex(): assert converters.to_regex("AB1234567890123456",pattern) == "error" # too long assert converters.to_regex("AB12345",pattern) == "error" # too short assert converters.to_regex("xxxxOz2054309383",pattern) == "error" # invalid format - assert converters.to_regex("date", "") == "date" # no pattern assert converters.to_regex("", pattern) == "" # no value assert converters.to_regex(None, pattern) == "" # no value From 3f3313d9e6af74a8015c8995031c04972014a142 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 30 Aug 2023 09:43:36 +0000 Subject: [PATCH 36/40] Implement min/max in to_numeric --- .../lds_csww_clean/cleaner.py | 8 +++----- .../lds_csww_clean/converters.py | 10 ++++++---- tests/social_work_workforce/test_cleaner.py | 19 +++++++++++++++++-- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 5e1e51b2..57acf184 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -75,11 +75,9 @@ def clean_numeric(event): clean_text = to_numeric(value=event.text, config=numeric) elif numeric == "decimal": decimal_places = int(event.schema_dict["decimal"]) - # min_inclusive = event.schema_dict["min_inclusive"] - # print(f"min_inclusive = {min_inclusive}") - # max_inclusive = event.schema_dict["max_inclusive"] - # print(f"max_inclusive = {max_inclusive}") - clean_text = to_numeric(value=event.text, config=numeric, 
decimal_places=decimal_places) # min_inclusive=min_inclusive, max_inclusive=max_inclusive + min_inclusive = event.schema_dict.get("min_inclusive", None) + max_inclusive = event.schema_dict.get("max_inclusive", None) + clean_text = to_numeric(value=event.text, config=numeric, decimal_places=decimal_places, min_inclusive=min_inclusive, max_inclusive=max_inclusive) if clean_text != "error": return event.from_event(event, text=clean_text, formatting_error="0") return event.from_event(event, text="", formatting_error="1") diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py index f2009770..9c813a7d 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -31,7 +31,7 @@ def to_category(string, categories): return "error" -def to_numeric(value, config, decimal_places=0): # min_inclusive=None, max_inclusive=None +def to_numeric(value, config, decimal_places=0, min_inclusive=None, max_inclusive=None): """ Convert any strings that should be integer or decimal based on the config into integer or decimal @@ -46,10 +46,12 @@ def to_numeric(value, config, decimal_places=0): # min_inclusive=None, max_inclu if value or value == 0: try: float(value) - round_to_dp = round(float(value), decimal_places) - return round_to_dp except (ValueError, TypeError): - return "error" # value incorrectly formatted + return "error" # value is not a float + round_to_dp = round(float(value), decimal_places) + if (min_inclusive is None or round_to_dp >= min_inclusive) and (max_inclusive is None or round_to_dp <= max_inclusive): + return round_to_dp + return "error" # min/max error return "" # no value provided if config == "integer": if value or value==0: diff --git a/tests/social_work_workforce/test_cleaner.py b/tests/social_work_workforce/test_cleaner.py index 6f4992c1..8bd6c22d 100644 --- a/tests/social_work_workforce/test_cleaner.py +++ b/tests/social_work_workforce/test_cleaner.py @@ -201,15 +201,30 @@ def test_clean_numeric(): assert cleaned_event.text == 123.46 assert cleaned_event.formatting_error == "0" + event = events.TextNode(text="0.45", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 0, "max_inclusive": 1}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == 0.45 + assert cleaned_event.formatting_error == "0" + + event = events.TextNode(text="1.99", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 0, "max_inclusive": 1}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.formatting_error == "1" # exceeds maximum value + + event = events.TextNode(text="0.50", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 1, "max_inclusive": 9}) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.formatting_error == "1" # less than minimum value + event = events.TextNode(text="string", schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.formatting_error == "1" + assert cleaned_event.formatting_error == "1" # not a decimal event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - 
assert cleaned_event.formatting_error == "1" + assert cleaned_event.formatting_error == "1" # not a decimal event = events.TextNode( text=datetime(2017, 3, 17), schema_dict={"not_numeric": "decimal"} From 23b511b06def2be74037d900a8646cee3c761020 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 30 Aug 2023 10:02:32 +0000 Subject: [PATCH 37/40] Add tests for min/max to_numeric --- tests/social_work_workforce/test_converters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py index 36aed7ce..a4526ebe 100644 --- a/tests/social_work_workforce/test_converters.py +++ b/tests/social_work_workforce/test_converters.py @@ -37,6 +37,11 @@ def test_to_numeric(): assert converters.to_numeric("date", "") == "date" assert converters.to_numeric("", "decimal", decimal_places) == "" assert converters.to_numeric(None, "decimal", decimal_places) == "" + assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=0, max_inclusive=1) == 0.3 + assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=0) == 0.3 + assert converters.to_numeric("0.3", "decimal", decimal_places, max_inclusive=1) == 0.3 + assert converters.to_numeric("1.99", "decimal", decimal_places, min_inclusive=0, max_inclusive=1) == "error" + assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=1, max_inclusive=99) == "error" assert converters.to_numeric("3000", "integer") == 3000 assert converters.to_numeric(123, "integer") == 123 assert converters.to_numeric("1.0", "integer") == 1 From 72f649da506694d8ef57652d283ef900211205d3 Mon Sep 17 00:00:00 2001 From: patrick-troy <58770937+patrick-troy@users.noreply.github.com> Date: Fri, 1 Sep 2023 15:51:06 +0100 Subject: [PATCH 38/40] add validation, fix minor errors, run python black --- liiatools/datasets/cin_census/cin_cli.py | 9 +- .../cin_census/lds_cin_clean/filters.py | 9 +- .../cin_census/lds_cin_clean/logger.py | 12 +- .../cin_census/lds_cin_la_agg/process.py | 2 +- .../cin_census/lds_cin_pan_agg/process.py | 2 +- liiatools/datasets/shared_functions/common.py | 20 ++- .../SWFtools/analysis/FTESum.py | 8 +- .../SWFtools/analysis/growth_tables.py | 2 +- .../SWFtools/analysis/seniority.py | 14 +- .../social_work_workforce/csww_cli.py | 3 +- .../csww_main_functions.py | 37 ++---- .../lds_csww_clean/cleaner.py | 19 +-- .../lds_csww_clean/converters.py | 39 +++--- .../lds_csww_clean/csww_record.py | 8 +- .../lds_csww_clean/file_creator.py | 10 +- .../lds_csww_clean/filters.py | 89 ++----------- .../lds_csww_clean/logger.py | 51 +++++-- .../lds_csww_clean/schema.py | 5 +- .../lds_csww_clean/validator.py | 63 +++++++++ .../lds_csww_clean/xml.py | 49 ------- .../lds_csww_data_generator/stream.py | 2 +- .../spec/social_work_workforce/la-agg.yml | 6 + .../spec/social_work_workforce/pan-agg.yml | 6 + tests/cin_census/test_converter.py | 4 +- tests/cin_census/test_file_creator.py | 1 - tests/cin_census/test_schema.py | 4 +- tests/common/test_common.py | 12 +- tests/s903/test_file_creator.py | 102 ++++++++++---- tests/s903/test_populate.py | 4 +- tests/social_work_workforce/test_cleaner.py | 124 +++++++++++++----- .../social_work_workforce/test_converters.py | 51 ++++--- .../social_work_workforce/test_csww_record.py | 7 - tests/social_work_workforce/test_logger.py | 45 +++++-- 33 files changed, 471 insertions(+), 348 deletions(-) create mode 100644 
liiatools/datasets/social_work_workforce/lds_csww_clean/validator.py delete mode 100644 liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py diff --git a/liiatools/datasets/cin_census/cin_cli.py b/liiatools/datasets/cin_census/cin_cli.py index 4648bcef..36d36400 100644 --- a/liiatools/datasets/cin_census/cin_cli.py +++ b/liiatools/datasets/cin_census/cin_cli.py @@ -28,7 +28,7 @@ check_year, check_year_within_range, save_year_error, - save_incorrect_year_error + save_incorrect_year_error, ) # Dependencies for la_agg() @@ -118,7 +118,12 @@ def cleanfile(input, la_code, la_log_dir, output): years_to_go_back = 6 year_start_month = 6 reference_date = datetime.now() - if check_year_within_range(input_year, years_to_go_back, year_start_month, reference_date) is False: + if ( + check_year_within_range( + input_year, years_to_go_back, year_start_month, reference_date + ) + is False + ): save_incorrect_year_error(input, la_log_dir) return diff --git a/liiatools/datasets/cin_census/lds_cin_clean/filters.py b/liiatools/datasets/cin_census/lds_cin_clean/filters.py index fe3d48e5..93909677 100644 --- a/liiatools/datasets/cin_census/lds_cin_clean/filters.py +++ b/liiatools/datasets/cin_census/lds_cin_clean/filters.py @@ -81,7 +81,10 @@ def _create_category_dict(field: str, file: str): documentation = element.findall(search_doc) for i, d in enumerate(documentation): name_dict = {"name": d.text} - category_dict["category"][i] = {**category_dict["category"][i], **name_dict} + category_dict["category"][i] = { + **category_dict["category"][i], + **name_dict, + } return category_dict @@ -99,7 +102,9 @@ def _create_float_dict(field: str, file: str): search_restriction = f".//{{http://www.w3.org/2001/XMLSchema}}restriction" restriction = element.findall(search_restriction) for r in restriction: - code_dict = {"numeric": r.get("base")[3:]} # Remove the "xs:" from the start of the base string + code_dict = { + "numeric": r.get("base")[3:] + } # Remove the "xs:" from the start of the base string if code_dict["numeric"] == "decimal": float_dict = code_dict diff --git a/liiatools/datasets/cin_census/lds_cin_clean/logger.py b/liiatools/datasets/cin_census/lds_cin_clean/logger.py index 92198036..d37268ef 100644 --- a/liiatools/datasets/cin_census/lds_cin_clean/logger.py +++ b/liiatools/datasets/cin_census/lds_cin_clean/logger.py @@ -66,9 +66,7 @@ def counter(event, counter_check, value_error, structural_error, blank_error): ) else: if hasattr(event, "validation_message"): - blank_error.append( - f"LAchildID: blank, Node: {event.schema.name}" - ) + blank_error.append(f"LAchildID: blank, Node: {event.schema.name}") elif hasattr(event.schema, "name"): value_error.append(f"LAchildID: blank, Node: {event.schema.name}") else: @@ -100,13 +98,7 @@ def save_errors_la( """ filename = str(Path(input).resolve().stem) start_time = f"{datetime.now():%Y-%m-%dT%H%M%SZ}" - if ( - value_error - or structural_error - or field_error - or blank_error - or LAchildID_error - ): + if value_error or structural_error or field_error or blank_error or LAchildID_error: with open( f"{Path(la_log_dir, filename)}_error_log_{start_time}.txt", "a", diff --git a/liiatools/datasets/cin_census/lds_cin_la_agg/process.py b/liiatools/datasets/cin_census/lds_cin_la_agg/process.py index fd328637..370e96c1 100644 --- a/liiatools/datasets/cin_census/lds_cin_la_agg/process.py +++ b/liiatools/datasets/cin_census/lds_cin_la_agg/process.py @@ -109,7 +109,7 @@ def _time_between_date_series(later_date_series, earlier_date_series, years=0, d elif years == 1: 
years_series = (days_series / 365).apply(np.floor) - years_series = years_series.astype('Int32') + years_series = years_series.astype("Int32") return years_series diff --git a/liiatools/datasets/cin_census/lds_cin_pan_agg/process.py b/liiatools/datasets/cin_census/lds_cin_pan_agg/process.py index ea692096..e8879f10 100644 --- a/liiatools/datasets/cin_census/lds_cin_pan_agg/process.py +++ b/liiatools/datasets/cin_census/lds_cin_pan_agg/process.py @@ -94,7 +94,7 @@ def _time_between_date_series(later_date_series, earlier_date_series, years=0, d elif years == 1: years_series = (days_series / 365).apply(np.floor) - years_series = years_series.astype('Int32') + years_series = years_series.astype("Int32") return years_series diff --git a/liiatools/datasets/shared_functions/common.py b/liiatools/datasets/shared_functions/common.py index be019df9..3095d2e9 100644 --- a/liiatools/datasets/shared_functions/common.py +++ b/liiatools/datasets/shared_functions/common.py @@ -83,7 +83,7 @@ def save_year_error(input, la_log_dir): :param la_log_dir: Path to the local authority's log folder :return: Text file containing the error information """ - + filename = Path(input).resolve().stem start_time = f"{datetime.now():%d-%m-%Y %Hh-%Mm-%Ss}" with open( @@ -93,8 +93,8 @@ def save_year_error(input, la_log_dir): f.write( f"Could not process '{filename}' because no year was found in the name of the file" ) - - + + def check_year_within_range(year, num_of_years, new_year_start_month, as_at_date): """ Check that year is within permitted range of data retention policy @@ -113,13 +113,13 @@ def check_year_within_range(year, num_of_years, new_year_start_month, as_at_date current_month = as_at_date.month if current_month < new_year_start_month: earliest_allowed_year = current_year - num_of_years - latest_allowed_year = current_year + latest_allowed_year = current_year else: earliest_allowed_year = current_year - num_of_years + 1 # roll forward one year latest_allowed_year = current_year + 1 return earliest_allowed_year <= year_to_check <= latest_allowed_year - + def save_incorrect_year_error(input, la_log_dir): """ @@ -177,10 +177,16 @@ def check_year(filename): fy_match = re.search(r"(\d{2})(.{0,3}\d{2})(.*)(\d*)", filename) if fy_match: - if len(fy_match.group(2)) == 2 and int(fy_match.group(2)) == int(fy_match.group(1)) + 1: + if ( + len(fy_match.group(2)) == 2 + and int(fy_match.group(2)) == int(fy_match.group(1)) + 1 + ): year = "20" + fy_match.group(2) return year - if len(fy_match.group(2)) == 3 and int(fy_match.group(2)[-2:]) == int(fy_match.group(1)) + 1: + if ( + len(fy_match.group(2)) == 3 + and int(fy_match.group(2)[-2:]) == int(fy_match.group(1)) + 1 + ): year = "20" + fy_match.group(2)[-2:] return year if int(fy_match.group(3)[1:3]) == int(fy_match.group(2)[-2:]) + 1: diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py index 6bc8e1b8..ecafe1b5 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/FTESum.py @@ -40,7 +40,7 @@ def FTESum_2020(): :return: Excel file with the name FTESum_2020.xlsx and the same path as the input file """ - + # ===== Read file ===== # file = "CompMergSen.csv" requestPath = work_path.request @@ -48,11 +48,13 @@ def FTESum_2020(): df = pd.read_csv(pathFile) df2020 = df[df["YearCensus"] == 2020] - + if df2020.empty: AppLogs.log("FTESum_2020 error: No data for year 2020", console_output=True) 
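# check_year_within_range (reformatted in the common.py hunk above) implements
# a rolling retention window. Standalone illustration with the values the
# pipelines pass (years_to_go_back=6, year_start_month=6):
from datetime import datetime

from liiatools.datasets.shared_functions.common import check_year_within_range

assert check_year_within_range(2017, 6, 6, datetime(2023, 5, 31)) is True  # window 2017..2023
assert check_year_within_range(2017, 6, 6, datetime(2023, 6, 1)) is False  # rolls to 2018..2024
assert check_year_within_range(2024, 6, 6, datetime(2023, 6, 1)) is True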
else: - df5D = df2020[["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"]] + df5D = df2020[ + ["LEAName", "YearCensus", "SeniorityCode", "SeniorityName", "FTE"] + ] df5D = df2020.groupby( ["LEAName", "YearCensus", "SeniorityCode", "SeniorityName"] diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py index 2147de37..def4cbf2 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/growth_tables.py @@ -12,7 +12,7 @@ def growth_tables(): """ Create two Excel files with tables of growth rates and population growth for six LEAs """ - + growth_rate_df = { "LEAName": [ "Havering", diff --git a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py index df4b2677..1d1d5236 100644 --- a/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py +++ b/liiatools/datasets/social_work_workforce/SWFtools/analysis/seniority.py @@ -137,9 +137,11 @@ def seniority_forecast_04(): AppLogs.log(f"FileNotFoundError: {e.filename}", console_output=True) return - if dfSen.empty: - AppLogs.log("seniority_forecast_04 error: No data in FTESum_2020.xlsx", console_output=True) + AppLogs.log( + "seniority_forecast_04 error: No data in FTESum_2020.xlsx", + console_output=True, + ) else: # ===== Rename column ===== # dfSen.rename(columns={"FTESum": "2020"}, inplace=True) @@ -160,9 +162,13 @@ def seniority_forecast_04(): dfSen[str(countYearBefore)] / p_df.loc[0, str(countYearBefore)] ) * p_df.loc[0, str(countYearNext)] # Barking and Dagenham - dfSen.loc[dfSen["LEAName"] == "Barking and Dagenham", str(countYearNext)] = ( + dfSen.loc[ + dfSen["LEAName"] == "Barking and Dagenham", str(countYearNext) + ] = ( dfSen[str(countYearBefore)] / p_df.loc[1, str(countYearBefore)] - ) * p_df.loc[1, str(countYearNext)] + ) * p_df.loc[ + 1, str(countYearNext) + ] # Redbridge dfSen.loc[dfSen["LEAName"] == "Redbridge", str(countYearNext)] = ( dfSen[str(countYearBefore)] / p_df.loc[2, str(countYearBefore)] diff --git a/liiatools/datasets/social_work_workforce/csww_cli.py b/liiatools/datasets/social_work_workforce/csww_cli.py index 99f7ca33..486e132c 100644 --- a/liiatools/datasets/social_work_workforce/csww_cli.py +++ b/liiatools/datasets/social_work_workforce/csww_cli.py @@ -24,6 +24,7 @@ def csww(): """ pass + @csww.command() @click.option( "--i", @@ -139,4 +140,4 @@ def pan_agg(input, la_code, output): :param output: should specify the path to the output folder :return: None """ - csww_main_functions.pan_agg(input, la_code, output) \ No newline at end of file + csww_main_functions.pan_agg(input, la_code, output) diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py index 63d599e2..6b88566f 100644 --- a/liiatools/datasets/social_work_workforce/csww_main_functions.py +++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py @@ -2,8 +2,13 @@ from datetime import datetime import yaml -from liiatools.datasets.social_work_workforce.lds_csww_data_generator.sample_data import generate_sample_csww_file -from liiatools.datasets.social_work_workforce.lds_csww_data_generator.stream import consume +# Dependencies for generate_sample() +from liiatools.datasets.social_work_workforce.lds_csww_data_generator.sample_data import ( + 
generate_sample_csww_file, +) +from liiatools.datasets.social_work_workforce.lds_csww_data_generator.stream import ( + consume, +) # Dependencies for cleanfile() from liiatools.datasets.social_work_workforce.lds_csww_clean.parse import ( @@ -23,6 +28,7 @@ cleaner, logger, filters, + validator as clean_validator, ) from liiatools.spec import common as common_asset_dir @@ -133,6 +139,7 @@ def cleanfile(input, la_code, la_log_dir, output): # Clean stream stream = cleaner.clean(stream) + stream = clean_validator.validate_elements(stream) stream = logger.log_errors(stream) # Output results @@ -212,29 +219,3 @@ def pan_agg(input, la_code, output): la_name = flip_dict(config["data_codes"])[la_code] csww_df = pan_process.merge_agg_files(output, table_name, csww_df, la_name) pan_process.export_pan_file(output, table_name, csww_df) - - -# Run in Visual Studio Code |> - -# cleanfile( -# "/workspaces/liia-tools/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022_sc.xml", -# "BAD", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) - -# la_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_worker_clean.csv", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) - -# la_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/social_work_workforce_2022_lalevel_clean.csv", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) - -# pan_agg( -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean/CSWW_CSWWWorker_merged.csv", -# "BAD", -# "/workspaces/liia-tools/liiatools/datasets/social_work_workforce/lds_csww_clean", -# ) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py index 57acf184..752ccea5 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/cleaner.py @@ -71,13 +71,16 @@ def clean_numeric(event): """ numeric = event.schema_dict["numeric"] try: - if numeric == "integer": - clean_text = to_numeric(value=event.text, config=numeric) - elif numeric == "decimal": - decimal_places = int(event.schema_dict["decimal"]) - min_inclusive = event.schema_dict.get("min_inclusive", None) - max_inclusive = event.schema_dict.get("max_inclusive", None) - clean_text = to_numeric(value=event.text, config=numeric, decimal_places=decimal_places, min_inclusive=min_inclusive, max_inclusive=max_inclusive) + decimal_places = event.schema_dict.get("decimal", None) + min_inclusive = event.schema_dict.get("min_inclusive", None) + max_inclusive = event.schema_dict.get("max_inclusive", None) + clean_text = to_numeric( + value=event.text, + config=numeric, + decimal_places=decimal_places, + min_inclusive=min_inclusive, + max_inclusive=max_inclusive, + ) if clean_text != "error": return event.from_event(event, text=clean_text, formatting_error="0") return event.from_event(event, text="", formatting_error="1") @@ -111,7 +114,7 @@ def clean(stream): """ Compile the cleaning functions - :param event: A list of event objects + :param stream: A list of event objects :return: An updated list of event objects """ stream = clean_dates(stream) diff --git 
a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py index 9c813a7d..4c895b04 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/converters.py @@ -19,7 +19,7 @@ def to_category(string, categories): return code["code"] if ( str(string).lower() == str(code["code"]).lower() + ".0" - ): # In case integers are read as floats + ): # In case integers are read as floats return code["code"] if "name" in code: if str(code["name"]).lower() in str(string).lower(): @@ -37,31 +37,25 @@ def to_numeric(value, config, decimal_places=0, min_inclusive=None, max_inclusiv :param value: Some value to convert to an integer or decimal :param config: The loaded configuration - :param dec_places: The number of decimal places to apply (default 0) + :param decimal_places: The number of decimal places to apply (default 0) :param min_inclusive: Minimum value allowed (default none) :param max_inclusive: Maximum value allowed (default none) - :return: Either an integer, a decimal value formatted to number of decimal places or an "error" string if value could not be formatted as decimal or a blank string if no value provided + :return: Either an integer, a decimal value formatted to number of decimal places or an "error" string if + value could not be formatted as decimal or a blank string if no value provided """ if config == "decimal": if value or value == 0: - try: - float(value) - except (ValueError, TypeError): - return "error" # value is not a float - round_to_dp = round(float(value), decimal_places) - if (min_inclusive is None or round_to_dp >= min_inclusive) and (max_inclusive is None or round_to_dp <= max_inclusive): + round_to_dp = round(float(value), int(decimal_places)) + if (min_inclusive is None or round_to_dp >= float(min_inclusive)) and ( + max_inclusive is None or round_to_dp <= float(max_inclusive) + ): return round_to_dp - return "error" # min/max error - return "" # no value provided + return "error" # min/max error + return "" # no value provided if config == "integer": - if value or value==0: - if isinstance(value, str) and value[-2:] == ".0": - return int(float(value)) - elif value or value == 0: - return int(value) - else: - return "error" # value incorrectly formatted - return "" # no value provided + if value or value == 0: + return int(float(value)) + return "" # no value provided else: return value @@ -72,13 +66,14 @@ def to_regex(value, pattern): :param value: Some value to convert to a regex string :param pattern: The regex pattern to compare - :return: Either a string matching the regex pattern or an "error" string if value does not match pattern or a blank string if no value provided + :return: Either a string matching the regex pattern or an "error" string if value does not match pattern or a + blank string if no value provided """ if value: stripped_value = value.strip() isfullmatch = re.fullmatch(pattern, stripped_value) if isfullmatch: return stripped_value - return "error" # value does not match regex pattern + return "error" # value does not match regex pattern else: - return "" # no value provided \ No newline at end of file + return "" # no value provided diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py index 85d058ff..b0adb5f4 100644 --- 
a/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/csww_record.py @@ -34,7 +34,7 @@ def text_collector(stream): Create a dictionary of text values for each element :param stream: An iterator of events from an XML parser - :return: Dictionary containing element name and text values + :return: Dictionary containing element name and text values """ data_dict = {} current_element = None @@ -82,9 +82,15 @@ def message_collector(stream): "GenderCurrent", "Ethnicity", "QualInst", + "QualLevel", "StepUpGrad", + "OrgRole", "RoleStartDate", "StartOrigin", + "RoleEndDate", + "LeaverDestination", + "ReasonLeave", + "FTE30", "Cases30", "WorkingDaysLost", "ContractWeeks", diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py index 38da3243..c7aae5ac 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/file_creator.py @@ -15,13 +15,13 @@ def convert_to_dataframe(data): return data -def get_year(data, year): +def add_year(data, year): data["YEAR"] = year return data def convert_to_datetime(data): - if set(["PersonBirthDate", "RoleStartDate"]).issubset(data): + if {"PersonBirthDate", "RoleStartDate"}.issubset(data): data[["PersonBirthDate", "RoleStartDate"]] = data[ ["PersonBirthDate", "RoleStartDate"] ].apply(pd.to_datetime) @@ -48,7 +48,9 @@ def degrade_SWENo(data): """ if "SWENo" in data: if data["SWENo"] is not None: - data["SWENo"] = data["SWENo"].apply(lambda row: swe_hash(row) if row else row ) + data["SWENo"] = data["SWENo"].apply( + lambda row: swe_hash(row) if row else row + ) return data @@ -81,7 +83,7 @@ def add_fields(input_year, data, la_name): :return: Dataframe with year and LA added """ data = convert_to_dataframe(data) - data = get_year(data, input_year) + data = add_year(data, input_year) data = add_la_name(data, la_name) return data diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py index 3073a5c8..8b81520f 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/filters.py @@ -2,11 +2,9 @@ from typing import List import xml.etree.ElementTree as ET import xmlschema -from xmlschema import XMLSchemaValidatorError from sfdata_stream_parser.checks import type_check from sfdata_stream_parser import events -from sfdata_stream_parser.collectors import collector, block_check from sfdata_stream_parser.filters.generic import streamfilter, pass_event log = logging.getLogger(__name__) @@ -88,6 +86,14 @@ def _create_category_dict(field: str, file: str): def _create_float_dict(field: str, file: str): + """ + Create a dictionary containing the different float parameters of a given field to conform floats + e.g. 
{'numeric': 'decimal', 'fixed': 'true', 'decimal': '6', 'min_inclusive': '0', 'max_inclusive': '1'} + + :param field: Name of the float field you want to find the parameters for + :param file: Path to the .xsd schema containing possible float parameters + :return: Dictionary of float parameters + """ float_dict = None xsd_xml = ET.parse(file) @@ -173,7 +179,11 @@ def add_schema(event, schema: xmlschema.XMLSchema): return event.from_event(event, path=path, schema=el) -@streamfilter(check=type_check(events.TextNode), fail_function=pass_event) +@streamfilter( + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, +) def add_schema_dict(event, schema_path: str): """ Add a dictionary of schema attributes to an event object based on its type and occurrence @@ -206,76 +216,3 @@ def add_schema_dict(event, schema_path: str): schema_dict = {**schema_dict, **{"canbeblank": False}} return event.from_event(event, schema_dict=schema_dict) - - -def _get_validation_error(schema, node) -> XMLSchemaValidatorError: - try: - schema.validate(node) - return None - except XMLSchemaValidatorError as e: - return e - - -@streamfilter(check=type_check(events.StartElement), fail_function=pass_event) -def validate_elements(event): - """ - Validates each element, and if not valid, sets the properties: - - * valid - (always False) - * validation_message - a descriptive validation message - """ - validation_error = _get_validation_error(event.schema, event.node) - if validation_error is None: - return event - - message = ( - validation_error.reason - if hasattr(validation_error, "reason") - else validation_error.message - ) - return events.StartElement.from_event( - event, valid=False, validation_message=message - ) - - -@streamfilter(check=type_check(events.StartElement), fail_function=pass_event) -def prop_to_attribute(event, prop_name): - """ - Elevates an event property to an XML attribute. 
- """ - if hasattr(event, prop_name): - attrs = getattr(event, "attrs", {}) - attrs[prop_name] = getattr(event, prop_name) - return events.StartElement.from_event(event, attrs=attrs) - else: - return event - - -@collector(check=block_check(events.StartElement), receive_stream=True) -def remove_invalid(stream, tag_name): - """ - Filters out events with the given tag name if they are not valid - """ - stream = list(stream) - first = stream[0] - last = stream[-1] - stream = stream[1:-1] - - if first.tag == tag_name and not getattr(first, "valid", True): - yield from [] - else: - yield first - - if len(stream) > 0: - yield from remove_invalid(stream, tag_name=tag_name) - - yield last - - -@streamfilter(check=lambda x: True) -def counter(event, counter_check, context): - if counter_check(event): - context["pass"] += 1 - else: - context["fail"] += 1 - return event diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py index 8c7e715a..b27eb03a 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/logger.py @@ -31,10 +31,7 @@ def create_formatting_error_list(stream): formatting_error_list=formatting_error_list, ) formatting_error_list = None - elif ( - formatting_error_list is not None - and isinstance(event, events.TextNode) - ): + elif formatting_error_list is not None and isinstance(event, events.TextNode): try: if event.formatting_error == "1": formatting_error_list.append(event.schema.name) @@ -44,7 +41,9 @@ def create_formatting_error_list(stream): @streamfilter( - check=type_check(events.TextNode), fail_function=pass_event, error_function=pass_event + check=type_check(events.TextNode), + fail_function=pass_event, + error_function=pass_event, ) def blank_error_check(event): """ @@ -82,10 +81,7 @@ def create_blank_error_list(stream): elif isinstance(event, ErrorTable): yield ErrorTable.from_event(event, blank_error_list=blank_error_list) blank_error_list = None - elif ( - blank_error_list is not None - and isinstance(event, events.TextNode) - ): + elif blank_error_list is not None and isinstance(event, events.TextNode): try: if event.blank_error == "1": blank_error_list.append(event.schema.name) @@ -94,6 +90,27 @@ def create_blank_error_list(stream): yield event +def create_validation_error_list(stream): + """ + Create a list of the validation errors + + :param stream: A filtered list of event objects + :return: An updated list of event objects + """ + validation_error_list = [] + for event in stream: + if isinstance(event, ErrorTable): + yield ErrorTable.from_event( + event, validation_error_list=validation_error_list + ) + validation_error_list = None + elif isinstance(event, events.StartElement): + validation_message = getattr(event, "validation_message", None) + if validation_message is not None: + validation_error_list.append(validation_message) + yield event + + def save_errors_la(stream, la_log_dir, filename): """ Count the error events and save them as a text file in the Local Authority Logs directory @@ -110,8 +127,13 @@ def save_errors_la(stream, la_log_dir, filename): if isinstance(event, ErrorTable) and ( event.formatting_error_list is not None and event.blank_error_list is not None + and event.validation_error_list is not None ): - if event.formatting_error_list or event.blank_error_list: + if ( + event.formatting_error_list + or event.blank_error_list + or event.validation_error_list + ): with open( 
f"{os.path.join(la_log_dir, filename)}_error_log_{start_time}.txt", "a", @@ -138,6 +160,14 @@ def save_errors_la(stream, la_log_dir, filename): str(blank_counter_dict)[9:-2] ) # Remove "Counter({" and "})" from string f.write("\n") + if event.validation_error_list: + event.validation_error_list = list( + dict.fromkeys(event.validation_error_list) + ) # Remove duplicate information from list but + # keep order + for item in event.validation_error_list: + f.write(item) + f.write("\n") except AttributeError: pass @@ -154,4 +184,5 @@ def log_errors(stream): stream = blank_error_check(stream) stream = create_formatting_error_list(stream) stream = create_blank_error_list(stream) + stream = create_validation_error_list(stream) return stream diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py index c0905a73..99baeae0 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/schema.py @@ -12,7 +12,10 @@ def __init__(self, year): @cached_property def path(self): - return Path(social_work_workforce_dir.__file__).parent / f"social_work_workforce_{self.__year}.xsd" + return ( + Path(social_work_workforce_dir.__file__).parent + / f"social_work_workforce_{self.__year}.xsd" + ) class Schema: diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/validator.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/validator.py new file mode 100644 index 00000000..c2177a71 --- /dev/null +++ b/liiatools/datasets/social_work_workforce/lds_csww_clean/validator.py @@ -0,0 +1,63 @@ +import logging +import re + +from sfdata_stream_parser.checks import type_check +from sfdata_stream_parser import events +from sfdata_stream_parser.filters.generic import streamfilter, pass_event + +log = logging.getLogger(__name__) + + +def _get_validation_error(event, schema, node): + """ + Validate an event + + :param event: A filtered list of event objects + :param schema: The xml schema attached to a given event + :param node: The node attached to a given event + :return: Event and error information + """ + try: + validation_error_iterator = schema.iter_errors(node) + for validation_error in validation_error_iterator: + if " expected" in validation_error.reason: + + reg_line = re.compile( + r"(?=\(line.*?(\w+))", re.MULTILINE + ) # Search for the number after "line" in error + missing_field_line = reg_line.search(str(validation_error)).group(1) + + reg_exp = re.compile( + r"(?=\sTag.*?(\w+))" + ) # Search for the first word after "Tag" + missing_field = reg_exp.search(validation_error.reason).group(1) + + errors = ( + f"Missing required field: '{missing_field}' which occurs in the node starting on " + f"line: {missing_field_line}" + ) + + return event.from_event(event, reason=errors) + + except AttributeError: # Raised for nodes that don't exist in the schema + reason = f"Unexpected node '{event.tag}'" + return event.from_event(event, reason=reason) + + +@streamfilter(check=type_check(events.StartElement), fail_function=pass_event) +def validate_elements(event): + """ + Validates each element, and if not valid, sets the properties: + + :param event: A filtered list of event objects + + * valid - (always False) + * validation_message - a descriptive validation message + """ + validation_error = _get_validation_error(event, event.schema, event.node) + + if validation_error is None: + return event + + message = validation_error.reason + 
return event.from_event(event, valid=False, validation_message=message) diff --git a/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py b/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py deleted file mode 100644 index 58bc03fc..00000000 --- a/liiatools/datasets/social_work_workforce/lds_csww_clean/xml.py +++ /dev/null @@ -1,49 +0,0 @@ -from sfdata_stream_parser.events import ( - StartElement, - EndElement, - TextNode, - CommentNode, - ProcessingInstructionNode, -) - -try: - from lxml import etree -except ImportError: - pass - - -def dom_parse(source, **kwargs): - """ - Equivalent of the xml parse included in the sfdata_stream_parser package, but uses the ET DOM - and allows direct DOM manipulation. - """ - parser = etree.iterparse(source, events=("start", "end", "comment", "pi"), **kwargs) - for action, elem in parser: - if action == "start": - yield StartElement(tag=elem.tag, attrib=elem.attrib, node=elem) - yield TextNode(text=elem.text) - elif action == "end": - yield EndElement(tag=elem.tag, node=elem) - if elem.tail: - yield TextNode(text=elem.tail) - elif action == "comment": - yield CommentNode(text=elem.text, node=elem) - elif action == "pi": - yield ProcessingInstructionNode(name=elem.target, text=elem.text, node=elem) - else: - raise ValueError(f"Unknown event: {action}") - - -def to_xml(stream, builder: etree.TreeBuilder): - for ev in stream: - if isinstance(ev, StartElement): - builder.start(ev.tag, getattr(ev, "attrs", {})) - elif isinstance(ev, EndElement): - builder.end(ev.tag) - elif isinstance(ev, TextNode): - builder.data(ev.text) - elif isinstance(ev, CommentNode): - builder.comment(ev.text) - elif isinstance(ev, ProcessingInstructionNode): - builder.pi(ev.name, ev.text) - yield ev diff --git a/liiatools/datasets/social_work_workforce/lds_csww_data_generator/stream.py b/liiatools/datasets/social_work_workforce/lds_csww_data_generator/stream.py index 45fc26d6..13c1bf56 100644 --- a/liiatools/datasets/social_work_workforce/lds_csww_data_generator/stream.py +++ b/liiatools/datasets/social_work_workforce/lds_csww_data_generator/stream.py @@ -3,7 +3,7 @@ def consume(stream) -> Counter: """ - Ensures the stream is consumed and returns a summary of the numbers of each event that has been encountered + Ensures the stream is consumed and returns a summary of the numbers of each event that has been encountered """ stream_types = [type(ev) for ev in stream] return Counter(stream_types) diff --git a/liiatools/spec/social_work_workforce/la-agg.yml b/liiatools/spec/social_work_workforce/la-agg.yml index 3d64fe54..781ebccd 100644 --- a/liiatools/spec/social_work_workforce/la-agg.yml +++ b/liiatools/spec/social_work_workforce/la-agg.yml @@ -7,9 +7,15 @@ column_names: - GenderCurrent - Ethnicity - QualInst + - QualLevel - StepUpGrad + - OrgRole - RoleStartDate - StartOrigin + - RoleEndDate + - LeaverDestination + - ReasonLeave + - FTE30 - Cases30 - WorkingDaysLost - ContractWeeks diff --git a/liiatools/spec/social_work_workforce/pan-agg.yml b/liiatools/spec/social_work_workforce/pan-agg.yml index ecf3b9b6..f74bdf34 100644 --- a/liiatools/spec/social_work_workforce/pan-agg.yml +++ b/liiatools/spec/social_work_workforce/pan-agg.yml @@ -7,9 +7,15 @@ column_names: - GenderCurrent - Ethnicity - QualInst + - QualLevel - StepUpGrad + - OrgRole - RoleStartDate - StartOrigin + - RoleEndDate + - LeaverDestination + - ReasonLeave + - FTE30 - Cases30 - WorkingDaysLost - ContractWeeks diff --git a/tests/cin_census/test_converter.py b/tests/cin_census/test_converter.py index 
74cd2eb5..e6f044db 100644 --- a/tests/cin_census/test_converter.py +++ b/tests/cin_census/test_converter.py @@ -15,7 +15,7 @@ def __init__(self): [ events.TextNode(text="false", schema=Schema()), events.TextNode(text="true", schema=Schema()), - events.TextNode(text="TRUE", schema=Schema()) + events.TextNode(text="TRUE", schema=Schema()), ] ) stream = list(stream) @@ -38,5 +38,3 @@ def __init__(self): assert stream[0].text == "false" assert stream[1].text == "true" assert stream[2].text == "true" - - diff --git a/tests/cin_census/test_file_creator.py b/tests/cin_census/test_file_creator.py index 0e8ac8a3..cc7d259d 100644 --- a/tests/cin_census/test_file_creator.py +++ b/tests/cin_census/test_file_creator.py @@ -2,7 +2,6 @@ import pandas as pd from datetime import datetime -import tempfile as tmp def test_get_year(): diff --git a/tests/cin_census/test_schema.py b/tests/cin_census/test_schema.py index 8a5a48fc..f19bd3c5 100644 --- a/tests/cin_census/test_schema.py +++ b/tests/cin_census/test_schema.py @@ -3,8 +3,8 @@ def test_schema(): - schema=Schema(2022).schema + schema = Schema(2022).schema assert schema.name == "CIN_schema_2022.xsd" - schema=Schema(2017).schema + schema = Schema(2017).schema assert schema.name == "CIN_schema_2017.xsd" diff --git a/tests/common/test_common.py b/tests/common/test_common.py index b1d7f0f5..318ac61f 100644 --- a/tests/common/test_common.py +++ b/tests/common/test_common.py @@ -2,7 +2,7 @@ check_postcode, flip_dict, check_year, - check_year_within_range + check_year_within_range, ) from liiatools.datasets.shared_functions.converters import ( to_short_postcode, @@ -65,11 +65,11 @@ def test_check_year(): def test_check_year_within_range(): - assert check_year_within_range(2016, 6, 6, datetime.datetime(2023,5,31)) is False - assert check_year_within_range(2023, 6, 6, datetime.datetime(2023,5,31)) is True - assert check_year_within_range(2024, 6, 6, datetime.datetime(2023,5,31)) is False - assert check_year_within_range(2024, 6, 6, datetime.datetime(2023,6,1)) is True - assert check_year_within_range(2013, 10, 2, datetime.datetime(2023,1,31)) is True + assert check_year_within_range(2016, 6, 6, datetime.datetime(2023, 5, 31)) is False + assert check_year_within_range(2023, 6, 6, datetime.datetime(2023, 5, 31)) is True + assert check_year_within_range(2024, 6, 6, datetime.datetime(2023, 5, 31)) is False + assert check_year_within_range(2024, 6, 6, datetime.datetime(2023, 6, 1)) is True + assert check_year_within_range(2013, 10, 2, datetime.datetime(2023, 1, 31)) is True class TestCheckYear(unittest.TestCase): diff --git a/tests/s903/test_file_creator.py b/tests/s903/test_file_creator.py index fb224795..c081b740 100644 --- a/tests/s903/test_file_creator.py +++ b/tests/s903/test_file_creator.py @@ -10,30 +10,54 @@ def test_coalesce_row(): stream = ( - events.StartRow(expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell="value_one", header="Header_1", expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell="value_two", header="Header_2", expected_columns = ["Header_1", "Header_2"]), - events.EndRow(expected_columns = ["Header_1", "Header_2"]), + events.StartRow(expected_columns=["Header_1", "Header_2"]), + events.Cell( + cell="value_one", + header="Header_1", + expected_columns=["Header_1", "Header_2"], + ), + events.Cell( + cell="value_two", + header="Header_2", + expected_columns=["Header_1", "Header_2"], + ), + events.EndRow(expected_columns=["Header_1", "Header_2"]), ) events_complete_rows = list(file_creator.coalesce_row(stream))[0] assert 
events_complete_rows.row == ["value_one", "value_two"] stream = ( - events.StartRow(expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell=125, header="Header_1", expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell=341, header="Header_2", expected_columns = ["Header_1", "Header_2"]), - events.EndRow(year=2019, expected_columns = ["Header_1", "Header_2"]), + events.StartRow(expected_columns=["Header_1", "Header_2"]), + events.Cell( + cell=125, header="Header_1", expected_columns=["Header_1", "Header_2"] + ), + events.Cell( + cell=341, header="Header_2", expected_columns=["Header_1", "Header_2"] + ), + events.EndRow(year=2019, expected_columns=["Header_1", "Header_2"]), ) events_complete_rows = list(file_creator.coalesce_row(stream))[0] assert events_complete_rows.row == [125, 341] assert events_complete_rows.year == 2019 stream = ( - events.StartRow(expected_columns = ["Header_1", "Header_2", "Header_3"]), - events.Cell(cell=125, header="Header_1", expected_columns = ["Header_1", "Header_2", "Header_3"]), - events.Cell(cell="string", header="Header_2", expected_columns = ["Header_1", "Header_2", "Header_3"]), - events.Cell(cell=datetime(2020, 3, 23), header="Header_3", expected_columns = ["Header_1", "Header_2", "Header_3"]), - events.EndRow(expected_columns = ["Header_1", "Header_2", "Header_3"]), + events.StartRow(expected_columns=["Header_1", "Header_2", "Header_3"]), + events.Cell( + cell=125, + header="Header_1", + expected_columns=["Header_1", "Header_2", "Header_3"], + ), + events.Cell( + cell="string", + header="Header_2", + expected_columns=["Header_1", "Header_2", "Header_3"], + ), + events.Cell( + cell=datetime(2020, 3, 23), + header="Header_3", + expected_columns=["Header_1", "Header_2", "Header_3"], + ), + events.EndRow(expected_columns=["Header_1", "Header_2", "Header_3"]), ) events_complete_rows = list(file_creator.coalesce_row(stream))[0] assert events_complete_rows.row == [ @@ -43,37 +67,53 @@ def test_coalesce_row(): ] stream = ( - events.StartRow(expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell=125, header="Header_1", expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell=None, header="Header_2", expected_columns = ["Header_1", "Header_2"]), - events.EndRow(expected_columns = ["Header_1", "Header_2"]), + events.StartRow(expected_columns=["Header_1", "Header_2"]), + events.Cell( + cell=125, header="Header_1", expected_columns=["Header_1", "Header_2"] + ), + events.Cell( + cell=None, header="Header_2", expected_columns=["Header_1", "Header_2"] + ), + events.EndRow(expected_columns=["Header_1", "Header_2"]), ) events_complete_rows = list(file_creator.coalesce_row(stream))[0] assert events_complete_rows.row == [125, None] stream = ( - events.StartRow(expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell=125, header="Header_1", expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell="", header="Header_2", expected_columns = ["Header_1", "Header_2"]), - events.EndRow(expected_columns = ["Header_1", "Header_2"]), + events.StartRow(expected_columns=["Header_1", "Header_2"]), + events.Cell( + cell=125, header="Header_1", expected_columns=["Header_1", "Header_2"] + ), + events.Cell( + cell="", header="Header_2", expected_columns=["Header_1", "Header_2"] + ), + events.EndRow(expected_columns=["Header_1", "Header_2"]), ) events_complete_rows = list(file_creator.coalesce_row(stream))[0] assert events_complete_rows.row == [125, ""] stream = ( - events.StartTable(expected_columns = ["Header_1", "Header_2"]), - 
events.StartRow(expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell="value_one", header="Header_1", expected_columns = ["Header_1", "Header_2"]), - events.Cell(cell="value_two", header="Header_2", expected_columns = ["Header_1", "Header_2"]), - events.EndRow(expected_columns = ["Header_1", "Header_2"]), - events.EndTable(expected_columns = ["Header_1", "Header_2"]), + events.StartTable(expected_columns=["Header_1", "Header_2"]), + events.StartRow(expected_columns=["Header_1", "Header_2"]), + events.Cell( + cell="value_one", + header="Header_1", + expected_columns=["Header_1", "Header_2"], + ), + events.Cell( + cell="value_two", + header="Header_2", + expected_columns=["Header_1", "Header_2"], + ), + events.EndRow(expected_columns=["Header_1", "Header_2"]), + events.EndTable(expected_columns=["Header_1", "Header_2"]), ) events_complete_rows = list(file_creator.coalesce_row(stream)) for event in events_complete_rows: if isinstance(event, file_creator.RowEvent): assert event.row == ["value_one", "value_two"] else: - assert event.as_dict() == {"expected_columns":["Header_1", "Header_2"]} + assert event.as_dict() == {"expected_columns": ["Header_1", "Header_2"]} def test_create_tables(): @@ -97,7 +137,11 @@ def test_create_tables(): assert event.data[0] == data[0] stream = ( - events.StartTable(headers=["CHILD ID", "DOB"], expected_columns=expected_columns, match_error="some_error"), + events.StartTable( + headers=["CHILD ID", "DOB"], + expected_columns=expected_columns, + match_error="some_error", + ), file_creator.RowEvent(row=[12345, datetime(2019, 4, 15).date()], year=2019), events.EndTable(), ) diff --git a/tests/s903/test_populate.py b/tests/s903/test_populate.py index d7892945..bd910f16 100644 --- a/tests/s903/test_populate.py +++ b/tests/s903/test_populate.py @@ -12,13 +12,13 @@ def test_add_year_column(): events.EndRow(), events.EndTable(), ], - year = "2022" + year="2022", ) stream = list(stream) assert stream[0].year == "2022" assert stream[1].year == "2022" assert not hasattr(stream[2], "year") - + def test_create_la_child_id(): stream = populate.create_la_child_id( diff --git a/tests/social_work_workforce/test_cleaner.py b/tests/social_work_workforce/test_cleaner.py index 8bd6c22d..7f6cd54c 100644 --- a/tests/social_work_workforce/test_cleaner.py +++ b/tests/social_work_workforce/test_cleaner.py @@ -4,7 +4,9 @@ def test_clean_dates(): - event = events.TextNode(text=datetime(2019, 1, 15), schema_dict={"date": "%d/%m/%Y"}) + event = events.TextNode( + text=datetime(2019, 1, 15), schema_dict={"date": "%d/%m/%Y"} + ) cleaned_event = list(cleaner.clean_dates(event))[0] assert cleaned_event.text == datetime(2019, 1, 15).date() assert cleaned_event.formatting_error == "0" @@ -145,7 +147,7 @@ def test_clean_numeric(): assert cleaned_event.text == "" assert cleaned_event.formatting_error == "0" - event = events.TextNode(text="123", schema_dict={"numeric": "integer"}) + event = events.TextNode(text="123.0", schema_dict={"numeric": "integer"}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 assert cleaned_event.formatting_error == "0" @@ -155,7 +157,9 @@ def test_clean_numeric(): assert cleaned_event.text == "" assert cleaned_event.formatting_error == "1" - event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "integer"}) + event = events.TextNode( + text=datetime(2017, 3, 17), schema_dict={"numeric": "integer"} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" assert 
cleaned_event.formatting_error == "1" @@ -166,26 +170,34 @@ def test_clean_numeric(): cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == datetime(2017, 3, 17) - event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 2}) + event = events.TextNode( + text=123.45, schema_dict={"numeric": "decimal", "decimal": 2} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.45 assert cleaned_event.formatting_error == "0" - - event = events.TextNode(text=123.4567, schema_dict={"numeric": "decimal", "decimal": 2}) + + event = events.TextNode( + text=123.4567, schema_dict={"numeric": "decimal", "decimal": 2} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.46 assert cleaned_event.formatting_error == "0" - event = events.TextNode(text=123.45, schema_dict={"numeric": "decimal", "decimal": 0}) + event = events.TextNode( + text=123.45, schema_dict={"numeric": "decimal", "decimal": 0} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123 assert cleaned_event.formatting_error == "0" - event = events.TextNode(text=123.456, schema_dict={"numeric": "decimal", "decimal": 6}) + event = events.TextNode( + text=123.456, schema_dict={"numeric": "decimal", "decimal": 6} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.456 assert cleaned_event.formatting_error == "0" - + event = events.TextNode(text="", schema_dict={"numeric": "decimal", "decimal": 2}) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" @@ -196,35 +208,72 @@ def test_clean_numeric(): assert cleaned_event.text == "" assert cleaned_event.formatting_error == "0" - event = events.TextNode(text="123.4567", schema_dict={"numeric": "decimal", "decimal": 2}) + event = events.TextNode( + text="123.4567", schema_dict={"numeric": "decimal", "decimal": 2} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 123.46 assert cleaned_event.formatting_error == "0" - event = events.TextNode(text="0.45", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 0, "max_inclusive": 1}) + event = events.TextNode( + text="string", schema_dict={"numeric": "decimal", "decimal": 2} + ) + cleaned_event = list(cleaner.clean_numeric(event))[0] + assert cleaned_event.text == "" + assert cleaned_event.formatting_error == "1" + + event = events.TextNode( + text="0.45", + schema_dict={ + "numeric": "decimal", + "decimal": 2, + "min_inclusive": 0, + "max_inclusive": 1, + }, + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == 0.45 assert cleaned_event.formatting_error == "0" - event = events.TextNode(text="1.99", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 0, "max_inclusive": 1}) + event = events.TextNode( + text="1.99", + schema_dict={ + "numeric": "decimal", + "decimal": 2, + "min_inclusive": 0, + "max_inclusive": 1, + }, + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.formatting_error == "1" # exceeds maximum value + assert cleaned_event.formatting_error == "1" # exceeds maximum value - event = events.TextNode(text="0.50", schema_dict={"numeric": "decimal", "decimal": 2, "min_inclusive": 1, "max_inclusive": 9}) + event = events.TextNode( + text="0.50", + schema_dict={ + "numeric": "decimal", + "decimal": 2, + "min_inclusive": 1, + "max_inclusive": 9, + }, + ) cleaned_event = 
list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.formatting_error == "1" # less than minimum value + assert cleaned_event.formatting_error == "1" # less than minimum value - event = events.TextNode(text="string", schema_dict={"numeric": "decimal", "decimal": 2}) + event = events.TextNode( + text="string", schema_dict={"numeric": "decimal", "decimal": 2} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.formatting_error == "1" # not a decimal + assert cleaned_event.formatting_error == "1" - event = events.TextNode(text=datetime(2017, 3, 17), schema_dict={"numeric": "decimal", "decimal": 2}) + event = events.TextNode( + text=datetime(2017, 3, 17), schema_dict={"numeric": "decimal", "decimal": 2} + ) cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == "" - assert cleaned_event.formatting_error == "1" # not a decimal + assert cleaned_event.formatting_error == "1" event = events.TextNode( text=datetime(2017, 3, 17), schema_dict={"not_numeric": "decimal"} @@ -232,8 +281,11 @@ def test_clean_numeric(): cleaned_event = list(cleaner.clean_numeric(event))[0] assert cleaned_event.text == datetime(2017, 3, 17) + def test_clean_regex_string(): - event = events.TextNode(text="AB1234567890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text="AB1234567890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "AB1234567890" assert cleaned_event.formatting_error == "0" @@ -243,38 +295,44 @@ def test_clean_regex_string(): assert cleaned_event.text == "" assert cleaned_event.formatting_error == "0" - event = events.TextNode(text=None, schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text=None, schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" assert cleaned_event.formatting_error == "0" - event = events.TextNode(text=" AB1234567890 ", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text=" AB1234567890 ", + schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}, + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "AB1234567890" assert cleaned_event.formatting_error == "0" - event = events.TextNode(text="AB123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text="AB123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" assert cleaned_event.formatting_error == "1" - event = events.TextNode(text="AB1234567890123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text="AB1234567890123456", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" assert cleaned_event.formatting_error == "1" - event = events.TextNode(text="AB12345 67890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text="AB12345 67890", schema_dict={"regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "" assert cleaned_event.formatting_error == "1" - event = events.TextNode(text="string", schema_dict={"not_regex_string": 
r"[A-Za-z]{2}\d{10}"}) + event = events.TextNode( + text="string", schema_dict={"not_regex_string": r"[A-Za-z]{2}\d{10}"} + ) cleaned_event = list(cleaner.clean_regex_string(event))[0] assert cleaned_event.text == "string" - - -# test_clean_dates() -# test_clean_categories() -# test_clean_numeric() -# test_clean_regex_string() - diff --git a/tests/social_work_workforce/test_converters.py b/tests/social_work_workforce/test_converters.py index a4526ebe..203d095e 100644 --- a/tests/social_work_workforce/test_converters.py +++ b/tests/social_work_workforce/test_converters.py @@ -37,11 +37,30 @@ def test_to_numeric(): assert converters.to_numeric("date", "") == "date" assert converters.to_numeric("", "decimal", decimal_places) == "" assert converters.to_numeric(None, "decimal", decimal_places) == "" - assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=0, max_inclusive=1) == 0.3 - assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=0) == 0.3 - assert converters.to_numeric("0.3", "decimal", decimal_places, max_inclusive=1) == 0.3 - assert converters.to_numeric("1.99", "decimal", decimal_places, min_inclusive=0, max_inclusive=1) == "error" - assert converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=1, max_inclusive=99) == "error" + assert ( + converters.to_numeric( + "0.3", "decimal", decimal_places, min_inclusive=0, max_inclusive=1 + ) + == 0.3 + ) + assert ( + converters.to_numeric("0.3", "decimal", decimal_places, min_inclusive=0) == 0.3 + ) + assert ( + converters.to_numeric("0.3", "decimal", decimal_places, max_inclusive=1) == 0.3 + ) + assert ( + converters.to_numeric( + "1.99", "decimal", decimal_places, min_inclusive=0, max_inclusive=1 + ) + == "error" + ) + assert ( + converters.to_numeric( + "0.3", "decimal", decimal_places, min_inclusive=1, max_inclusive=99 + ) + == "error" + ) assert converters.to_numeric("3000", "integer") == 3000 assert converters.to_numeric(123, "integer") == 123 assert converters.to_numeric("1.0", "integer") == 1 @@ -53,17 +72,11 @@ def test_to_numeric(): def test_to_regex(): - pattern=r"[A-Za-z]{2}\d{10}" - assert converters.to_regex("AB1234567890",pattern) == "AB1234567890" # match - assert converters.to_regex(" AB1234567890 ",pattern) == "AB1234567890" # match - assert converters.to_regex("AB1234567890123456",pattern) == "error" # too long - assert converters.to_regex("AB12345",pattern) == "error" # too short - assert converters.to_regex("xxxxOz2054309383",pattern) == "error" # invalid format - assert converters.to_regex("", pattern) == "" # no value - assert converters.to_regex(None, pattern) == "" # no value - - -# test_to_category() -# test_to_integer() -# test_to_decimal() -# test_to_regex() + pattern = r"[A-Za-z]{2}\d{10}" + assert converters.to_regex("AB1234567890", pattern) == "AB1234567890" # match + assert converters.to_regex(" AB1234567890 ", pattern) == "AB1234567890" # match + assert converters.to_regex("AB1234567890123456", pattern) == "error" # too long + assert converters.to_regex("AB12345", pattern) == "error" # too short + assert converters.to_regex("xxxxOz2054309383", pattern) == "error" # invalid format + assert converters.to_regex("", pattern) == "" # no value + assert converters.to_regex(None, pattern) == "" # no value diff --git a/tests/social_work_workforce/test_csww_record.py b/tests/social_work_workforce/test_csww_record.py index 88cd3890..3b25973d 100644 --- a/tests/social_work_workforce/test_csww_record.py +++ b/tests/social_work_workforce/test_csww_record.py @@ -1,4 
+1,3 @@
-# Import the unittest module and the code to be tested
 import unittest
 from sfdata_stream_parser.events import StartElement, EndElement, TextNode
 from liiatools.datasets.social_work_workforce.lds_csww_clean.csww_record import (
@@ -8,7 +7,6 @@
     LALevelEvent,
     HeaderEvent,
 )
-from liiatools.datasets.social_work_workforce.lds_csww_clean.xml import dom_parse


 class TestRecord(unittest.TestCase):
@@ -73,8 +71,3 @@ def test_message_collector(self):
         self.assertEqual(
             test_events[2].record, {"ID": "100", "SWENo": "AB123456789", "Agency": "0"}
         )
-
-
-# Run the tests
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/social_work_workforce/test_logger.py b/tests/social_work_workforce/test_logger.py
index ad00a80b..233d0863 100644
--- a/tests/social_work_workforce/test_logger.py
+++ b/tests/social_work_workforce/test_logger.py
@@ -1,8 +1,3 @@
-import tempfile as tmp
-from unittest.mock import patch
-from pathlib import Path
-from datetime import datetime
-
 from liiatools.datasets.social_work_workforce.lds_csww_clean import logger
 from sfdata_stream_parser import events
@@ -62,11 +57,21 @@ def test_create_formatting_error_list():
 def test_blank_error_check():
     mock_stream = logger.blank_error_check(
         [
-            events.TextNode(schema_dict={"canbeblank": False}, text="", formatting_error="0"),
-            events.TextNode(schema_dict={"canbeblank": False}, text=None, formatting_error="0"),
-            events.TextNode(schema_dict={"canbeblank": False}, text="", formatting_error="1"),
-            events.TextNode(schema_dict={"canbeblank": False}, text="string", formatting_error="0"),
-            events.TextNode(schema_dict={"canbeblank": True}, text="", formatting_error="0"),
+            events.TextNode(
+                schema_dict={"canbeblank": False}, text="", formatting_error="0"
+            ),
+            events.TextNode(
+                schema_dict={"canbeblank": False}, text=None, formatting_error="0"
+            ),
+            events.TextNode(
+                schema_dict={"canbeblank": False}, text="", formatting_error="1"
+            ),
+            events.TextNode(
+                schema_dict={"canbeblank": False}, text="string", formatting_error="0"
+            ),
+            events.TextNode(
+                schema_dict={"canbeblank": True}, text="", formatting_error="0"
+            ),
         ]
     )
     stream = list(mock_stream)
@@ -89,13 +94,25 @@ def test_create_blank_error_list():
     events_with_blank_error_list = list(logger.create_blank_error_list(mock_stream))
     for event in events_with_blank_error_list:
         if isinstance(event, logger.ErrorTable) and event.as_dict() != {}:
-            print(event.blank_error_list)
             assert event.blank_error_list == [
                 "some_header",
                 "some_header_2",
             ]
-test_create_formatting_error_list()
-test_blank_error_check()
-test_create_blank_error_list()
+
+
+def test_create_validation_error_list():
+    # create_validation_error_list collects validation_message values from
+    # StartElement events and yields them on the ErrorTable event
+    mock_stream = (
+        events.StartElement(
+            tag="LALevelVacancies", validation_message="error_message"
+        ),
+        events.StartElement(
+            tag="CSWWWorker", validation_message="error_message_2"
+        ),
+        events.StartElement(tag="CSWWWorker"),
+        events.EndElement(tag="Message"),
+        logger.ErrorTable(),
+    )
+    events_with_validation_error_list = list(
+        logger.create_validation_error_list(mock_stream)
+    )
+    for event in events_with_validation_error_list:
+        if isinstance(event, logger.ErrorTable) and event.as_dict() != {}:
+            assert event.validation_error_list == [
+                "error_message",
+                "error_message_2",
+            ]
From d357595558ef229839e3949556258d2f7e7eed69 Mon Sep 17 00:00:00 2001
From: patrick-troy <58770937+patrick-troy@users.noreply.github.com>
Date: Wed, 8 Nov 2023 14:12:37 +0200
Subject: [PATCH 39/40] update parameters for
 save_incorrect_year_error call
---
 liiatools/datasets/social_work_workforce/csww_cli.py            | 2 +-
 liiatools/datasets/social_work_workforce/csww_main_functions.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/liiatools/datasets/social_work_workforce/csww_cli.py b/liiatools/datasets/social_work_workforce/csww_cli.py
index 486e132c..e50db088 100644
--- a/liiatools/datasets/social_work_workforce/csww_cli.py
+++ b/liiatools/datasets/social_work_workforce/csww_cli.py
@@ -20,7 +20,7 @@
 @click.group()
 def csww():
     """
-    Functions for creating CSWW Census sample file generator
+    Functions for cleaning, minimising and aggregating CSWW files
     """
     pass
diff --git a/liiatools/datasets/social_work_workforce/csww_main_functions.py b/liiatools/datasets/social_work_workforce/csww_main_functions.py
index 6b88566f..b677fbca 100644
--- a/liiatools/datasets/social_work_workforce/csww_main_functions.py
+++ b/liiatools/datasets/social_work_workforce/csww_main_functions.py
@@ -126,7 +126,7 @@ def cleanfile(input, la_code, la_log_dir, output):
         )
         is False
     ):
-        save_incorrect_year_error(input, la_log_dir)
+        save_incorrect_year_error(input, la_log_dir, retention_period=YEARS_TO_GO_BACK - 1)
         return

     # Configure stream
From 528ee0c59cd2609f448a029c783fa3bbd03c162e Mon Sep 17 00:00:00 2001
From: patrick-troy <58770937+patrick-troy@users.noreply.github.com>
Date: Wed, 8 Nov 2023 14:33:28 +0200
Subject: [PATCH 40/40] remove unnecessary folders
---
 .../csww/BAD/social_work_workforce_2021.xml   | 556 ------------------
 .../csww/NEW/social_work_workforce_2022.xml   | 556 ------------------
 .../samples/flatfiles/BAD/la_log/blank.txt    |   0
 .../samples/log_files/blank.txt               |   0
 .../samples/outputs/blank.txt                 |   0
 .../samples/request/blank.txt                 |   0
 .../BAD => }/social_work_workforce_2022.xml   |   0
 .../social_work_workforce_2021.xsd            | 254 --------
 8 files changed, 1366 deletions(-)
 delete mode 100644 liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
 delete mode 100644 liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml
 delete mode 100644 liiatools/spec/social_work_workforce/samples/flatfiles/BAD/la_log/blank.txt
 delete mode 100644 liiatools/spec/social_work_workforce/samples/log_files/blank.txt
 delete mode 100644 liiatools/spec/social_work_workforce/samples/outputs/blank.txt
 delete mode 100644 liiatools/spec/social_work_workforce/samples/request/blank.txt
 rename liiatools/spec/social_work_workforce/samples/{csww/BAD => }/social_work_workforce_2022.xml (100%)
 delete mode 100644 liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd

diff --git a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml b/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
deleted file mode 100644
index 8ef1b9ef..00000000
--- a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2021.xml
+++ /dev/null
@@ -1,556 +0,0 @@
- - CSWW - 2021 - 2021-09-30 - - - L - 301 - liiatools.datasets.social_work_workforce.sample_data - 2023-03-28T14:54:55Z - -
- - 66.66 - 40.40 - 100 - - - 1 - Ox2054309383 - 0.521371 - 1969-01-22 - 1 - REFU - Institution Name - 0 - 1988-04-07 - 9 - 72 - 15.31 - 288.7 - 1 - 0 - TRN - 1 - - - 1 - Yk7226043359 - 0 - 1958-04-07 - 9 - Institution Name - 1 - 2 - 8 - 2019-09-23 - 7 - 10 - 0.603665 - 66 - 29.87 - 2.5 - 1 - 4 - - - 1 - iP8098309864 - 0 - 1984-01-12 - APKN - Institution Name - 0 - 5 - 2014-01-26 - 4 - 2023-03-28 - 3 - 9 - 0.23246 - 92.56 - 213.4 - 0 - 2 - - - 0 - oP8178849586 - 0.899676 - 1990-09-28 - 9 - BAFR - 1 - 0 - 6 - 2023-03-28 - 2 - 0.429963 - 14.39 - 0 - - - 1 - nH9419631053 - 0.133587 - 2 - AIND - Institution Name - 3 - 1 - 2017-06-10 - 0.436348 - 5.39 - 475.7 - 1 - UNP - 1 - - - 1 - JJ3661684122 - 0 - 1993-05-19 - 9 - 3 - 2020-06-14 - 3 - 2023-03-28 - 1 - 5 - 0.903669 - 11 - 141.0 - 1 - - - 1 - tN2120744892 - 0.803122 - WBRI - Institution Name - 3 - 0.964327 - 95.06 - 403.6 - 0 - 2 - - - 1 - Zo9779760045 - 0.767688 - 1996-08-31 - 0 - MWAS - 2 - 1 - 5 - 2023-03-28 - 62 - 1 - 2 - - - 0 - wf3752370095 - 0.843488 - 1959-04-17 - 2 - APKN - Institution Name - 3 - 0 - 1997-10-01 - 2 - 0.712824 - 16.74 - 456.3 - 0 - 4 - - - 1 - OW2475789301 - 0 - 1971-10-02 - Institution Name - 1 - 1 - 1993-10-04 - 2023-03-28 - 3 - 6 - 0.908092 - 45 - 22.98 - 441.5 - 0 - 3 - - - 1 - Kv3016593719 - 0.12232 - 1996-06-05 - 1 - BAFR - 2 - 6 - 10 - 0.641824 - 23 - 36.13 - 213.1 - 0 - - - 0 - TB9669555723 - 0 - 1987-10-30 - 1 - 0 - 2 - 2012-10-02 - 2 - 2023-03-28 - 6 - 7 - 37 - 90.85 - 28.5 - 1 - 1 - UNA - 1 - - - 1 - QK8499162867 - 0 - 1968-11-27 - ABAN - 2 - 0 - 6 - 2018-08-03 - 9 - 2023-03-28 - 2 - 9 - 0.078464 - 43.02 - 154.7 - 1 - 1 - - - 0 - Wr5514040878 - 0 - 0 - AOTH - Institution Name - 1 - 0 - 2 - 2015-04-24 - 9 - 2023-03-28 - 1 - 6 - 3.51 - 424.0 - 2 - - - 0 - Aj9242652291 - 0.859218 - 1968-12-31 - 0 - BCRB - Institution Name - 1 - 1 - 1 - 2003-09-12 - 5 - 0.320526 - 85 - 98.22 - 206.6 - 2 - - - 0 - Jv2635496195 - 0.021911 - 1977-06-27 - REFU - Institution Name - 1 - 6 - 2022-10-08 - 6 - 0.69819 - 25 - 29.19 - 1 - 1 - SIC - - - 1 - To5555885076 - 0.786453 - 1996-11-18 - 0 - MWAS - Institution Name - 3 - 0 - 4 - 2023-03-28 - 1 - 0.441344 - 83 - 78.29 - 364.4 - 1 - - - 0 - rK9218104079 - 0.491425 - 1998-04-15 - 1 - 3 - 0 - 2023-03-28 - 4 - 0.939826 - 3.1 - 415.3 - 4 - - - 1 - cD9282390165 - 0.192894 - 1959-09-25 - 0 - REFU - Institution Name - 2 - 1 - 3 - 1985-12-12 - 9 - 0.18449 - 14 - 188.4 - 0 - 1 - - - 0 - zU6140515687 - 0 - 1962-11-04 - WBRI - 3 - 1 - 3 - 1999-07-14 - 2018-08-20 - 8 - 3 - 0.222573 - 65 - 16.26 - 1 - - - 1 - ih3342923522 - 0.862474 - 1992-02-18 - 0 - WBRI - 3 - 1 - 2023-03-28 - 4 - 0.761443 - 39 - 0 - 2 - - - 1 - cm3809724991 - 0 - 2001-10-29 - 1 - AIND - 3 - 0 - 1 - 2023-03-28 - 5 - 4 - 0.530908 - 29 - 38.71 - 339.9 - 0 - - - 1 - PA8564166424 - 0.668266 - 1983-04-13 - 9 - Institution Name - 1 - 1 - 2023-03-28 - 3 - 0.707445 - 1 - 2 - - - 0 - QW8564363911 - 0.978729 - 1958-04-26 - 9 - MWBA - Institution Name - 1 - 2002-01-31 - 1 - 0.698641 - 121.9 - 1 - 3 - - - 1 - PQ5842914246 - 0 - 1989-06-05 - 1 - 1 - 1 - 2011-08-31 - 5 - 2023-03-28 - 7 - 9 - 0.443976 - 70 - 12.2 - 301.3 - 0 - 4 - - - 0 - ZQ9393137749 - 0 - 1981-09-21 - CHNE - 1 - 1 - 2001-02-10 - 6 - 2023-03-28 - 4 - 1 - 0.821627 - 94.67 - 471.5 - 0 - 2 - - - 1 - Pv9093835426 - 0.561974 - OOTH - Institution Name - 2 - 0 - 6 - 2014-09-30 - 4 - 0.965936 - 63 - 87.59 - 0 - SIC - 1 - - - 0 - eW7601111729 - 0 - Institution Name - 3 - 0 - 1 - 1993-04-18 - 3 - 2023-03-28 - 7 - 1 - 0.63075 - 80 - 299.1 - 0 - 4 - - - 0 - Jd1465867330 - 0.034436 - 2 - 
APKN - Institution Name - 1 - 1997-01-11 - 4 - 0.22182 - 23 - 83.01 - 0 - 3 - - - 1 - od1620971821 - 0 - 1975-01-19 - 9 - WOTH - Institution Name - 2 - 0 - 1 - 2016-08-20 - 9 - 2023-03-28 - 5 - 9 - 87 - 13.01 - 1 - -
\ No newline at end of file diff --git a/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml b/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml deleted file mode 100644 index 8c3fc4a5..00000000 --- a/liiatools/spec/social_work_workforce/samples/csww/NEW/social_work_workforce_2022.xml +++ /dev/null @@ -1,556 +0,0 @@ - -
- - CSWW - 2022 - 2022-09-30 - - - L - 316 - liiatools.datasets.social_work_workforce.sample_data - 2023-03-28T14:54:55Z - -
[generated sample content: the XML markup of this file was lost in extraction. The surviving values show some thirty generated worker records, each carrying an agency-worker flag (0/1), a worker identifier (e.g. Oy2054309383), an FTE fraction, a date of birth, a gender code (0/1/2/9), an ethnicity code (REFU, APKN, BAFR, AIND, WBRI, MWAS, BCRB, MWBA, AOTH, OOTH, CHNE, WOTH, ABAN), a qualification institution placeholder ("Institution Name"), role start/end dates, and assorted numeric measures.]
\ No newline at end of file
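[editor's note: since the markup did not survive, the sketch below is a hedged reconstruction of a single record, for orientation only. The element names (CSWWWorker, AgencyWorker, SWENo, FTE, PersonBirthDate, GenderCurrent, Ethnicity, QualInst) are assumptions inferred from the surviving values and the enumeration labels of the 2021 XSD, not recovered source; the values are taken verbatim from the first surviving record.]

    <CSWWWorker>
        <AgencyWorker>1</AgencyWorker>                  <!-- 0 = not an agency worker, 1 = agency worker -->
        <SWENo>Oy2054309383</SWENo>                     <!-- worker identifier seen in the sample values -->
        <FTE>0.521371</FTE>                             <!-- full-time-equivalent fraction -->
        <PersonBirthDate>1977-08-10</PersonBirthDate>
        <GenderCurrent>1</GenderCurrent>                <!-- coded 0/1/2/9 in the sample data -->
        <Ethnicity>REFU</Ethnicity>                     <!-- "Declared not stated or Refused" per the XSD labels -->
        <QualInst>Institution Name</QualInst>           <!-- qualification institution placeholder -->
    </CSWWWorker>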
diff --git a/liiatools/spec/social_work_workforce/samples/flatfiles/BAD/la_log/blank.txt b/liiatools/spec/social_work_workforce/samples/flatfiles/BAD/la_log/blank.txt
deleted file mode 100644
index e69de29b..00000000
diff --git a/liiatools/spec/social_work_workforce/samples/log_files/blank.txt b/liiatools/spec/social_work_workforce/samples/log_files/blank.txt
deleted file mode 100644
index e69de29b..00000000
diff --git a/liiatools/spec/social_work_workforce/samples/outputs/blank.txt b/liiatools/spec/social_work_workforce/samples/outputs/blank.txt
deleted file mode 100644
index e69de29b..00000000
diff --git a/liiatools/spec/social_work_workforce/samples/request/blank.txt b/liiatools/spec/social_work_workforce/samples/request/blank.txt
deleted file mode 100644
index e69de29b..00000000
diff --git a/liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml b/liiatools/spec/social_work_workforce/samples/social_work_workforce_2022.xml
similarity index 100%
rename from liiatools/spec/social_work_workforce/samples/csww/BAD/social_work_workforce_2022.xml
rename to liiatools/spec/social_work_workforce/samples/social_work_workforce_2022.xml
diff --git a/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd b/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd
deleted file mode 100644
index bc9f98a3..00000000
--- a/liiatools/spec/social_work_workforce/social_work_workforce_2021.xsd
+++ /dev/null
@@ -1,254 +0,0 @@
[deleted schema content: the XML markup of this 254-line XSD was lost in extraction. The surviving annotation labels show the coded enumerations the 2021 schema defined:
- agency worker: Not an Agency Worker; Agency Worker
- gender: Not Known; Male; Female; Not Specified
- ethnicity: White British; White Irish; Any Other White Background; White and Black Caribbean; White and Black African; White and Asian; Any Other Mixed Background; Indian; Pakistani; Bangladeshi; Any Other Asian Background; Black Caribbean; Black African; Any Other Black Background; Chinese; Any Other Ethnic Group; Declared not stated or Refused; Information Not Yet Obtained
- qualification level: Under-graduate; Post-graduate; Other
- yes/no flag: No; Yes
- seniority: Senior Manager; Middle Manager; First Line Manager; Senior Practitioner; Case Holder; Qualified Without Cases
- origin of new starters: Newly Qualified Social Workers; Social Worker Role in a Different Local Authority in England; Social Worker Role Outside England; Agency or Consultancy in Social Work (in England); Other Social Work Role Non-local Authority (in England); Other Social Care Role in Local Authority/Non-local Authority (in England); Non-social Care Role/Any Role Outside England/No Employment/Career Break; Other; Not Known; Not Yet Collected
- destination of leavers: the same list as origin, minus Newly Qualified Social Workers
- reason for leaving: Resignation; Voluntary Redundancy; Compulsory Redundancy; Dismissed; Retired; Deceased; Moved to a Non-child and Family Social Work Role Within LA; Other; Not Known; Not Yet Collected
- reason for absence: Maternity/Paternity leave; Other Paid Authorised Absence, Such As: Compassionate Leave, Annual Leave Requiring Reallocation Of Cases; Paid Absence For Public Duties, Such As: Jury Duty; Sick Leave; Training; Unauthorised Absence; Unpaid Authorised Absence
- role type: Assessed and Supported Year in Employment (AYSE); Frontline Practitioner; Practice Supervisor; Practice Leader]
\ No newline at end of file
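[editor's note: as a hedged illustration of the deleted schema's shape, one of its enumerations plausibly followed the standard XSD annotation pattern sketched below. The type name "gendertype" and the value-to-label mapping are assumptions inferred from the surviving labels and the 0/1/2/9 gender codes seen in the sample data, not recovered source.]

    <xs:simpleType name="gendertype">
        <xs:restriction base="xs:string">
            <xs:enumeration value="0">
                <xs:annotation><xs:documentation>Not Known</xs:documentation></xs:annotation>
            </xs:enumeration>
            <xs:enumeration value="1">
                <xs:annotation><xs:documentation>Male</xs:documentation></xs:annotation>
            </xs:enumeration>
            <xs:enumeration value="2">
                <xs:annotation><xs:documentation>Female</xs:documentation></xs:annotation>
            </xs:enumeration>
            <xs:enumeration value="9">
                <xs:annotation><xs:documentation>Not Specified</xs:documentation></xs:annotation>
            </xs:enumeration>
        </xs:restriction>
    </xs:simpleType>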