NOTE(review): line breaks, blank context lines and context indentation were
reconstructed from the hunk line counts - verify them against the target tree
before applying. The index hashes of the two new files predate the review
changes made to their added lines.

diff --git a/pyxform/builder.py b/pyxform/builder.py
index cf2f1ea1..95e52936 100644
--- a/pyxform/builder.py
+++ b/pyxform/builder.py
@@ -7,7 +7,7 @@
 import re
 
 from pyxform import file_utils, utils
-from pyxform.entity_declaration import EntityDeclaration
+from pyxform.entities.entity_declaration import EntityDeclaration
 from pyxform.errors import PyXFormError
 from pyxform.external_instance import ExternalInstance
 from pyxform.question import (
diff --git a/pyxform/constants.py b/pyxform/constants.py
index 546b51e8..e159749f 100644
--- a/pyxform/constants.py
+++ b/pyxform/constants.py
@@ -124,3 +124,8 @@
 
 ROW_FORMAT_STRING: str = "[row : %s]"
 XML_IDENTIFIER_ERROR_MESSAGE = "must begin with a letter, colon, or underscore. Other characters can include numbers, dashes, and periods."
+_MSG_SUPPRESS_SPELLING = (
+    " If you do not mean to include a sheet, to suppress this message, "
+    "prefix the sheet name with an underscore. For example 'setting' "
+    "becomes '_setting'."
+)
diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py
new file mode 100644
index 00000000..7f6b9460
--- /dev/null
+++ b/pyxform/entities/entities_parsing.py
@@ -0,0 +1,112 @@
+from typing import Dict, List
+
+from pyxform import constants
+from pyxform.errors import PyXFormError
+from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
+
+
+def get_entity_declaration(workbook_dict: Dict, warnings: List) -> Dict:
+    """
+    Build the entity declaration from the workbook's entities sheet.
+
+    :param workbook_dict: The workbook data, keyed by sheet name.
+    :param warnings: Warnings list; a possible sheet name misspelling warning
+      is appended to it when the entities sheet is missing or empty.
+    :return: The entity declaration, or an empty dict if no entity is declared.
+    :raises PyXFormError: If more than one entity is declared, the dataset or
+      label column is missing, or the dataset name is reserved or invalid.
+    """
+    entities_sheet = workbook_dict.get(constants.ENTITIES, [])
+
+    if len(entities_sheet) == 0:
+        similar = find_sheet_misspellings(
+            key=constants.ENTITIES, keys=workbook_dict.keys()
+        )
+        if similar is not None:
+            warnings.append(similar + constants._MSG_SUPPRESS_SPELLING)
+        return {}
+    elif len(entities_sheet) > 1:
+        raise PyXFormError(
+            "This version of pyxform only supports declaring a single entity per form. Please make sure your entities sheet only declares one entity."
+        )
+
+    entity = entities_sheet[0]
+    if "dataset" not in entity:
+        raise PyXFormError("The entities sheet is missing the required dataset column.")
+
+    dataset = entity["dataset"]
+    # Decode first: str.startswith and the tag regex below both need a str.
+    if isinstance(dataset, bytes):
+        dataset = dataset.decode("utf-8")
+
+    if dataset.startswith(constants.ENTITIES_RESERVED_PREFIX):
+        raise PyXFormError(
+            f"Invalid dataset name: '{dataset}' starts with reserved prefix {constants.ENTITIES_RESERVED_PREFIX}."
+        )
+
+    if not is_valid_xml_tag(dataset):
+        raise PyXFormError(
+            f"Invalid dataset name: '{dataset}'. Dataset names {constants.XML_IDENTIFIER_ERROR_MESSAGE}"
+        )
+
+    if "label" not in entity:
+        raise PyXFormError("The entities sheet is missing the required label column.")
+
+    return {
+        "name": "entity",
+        "type": "entity",
+        "parameters": {
+            "dataset": dataset,
+            # Defaults to "1" when the row has no create_if value.
+            "create": entity.get("create_if", "1"),
+            "label": entity["label"],
+        },
+    }
+
+
+def validate_entity_saveto(row: Dict, row_number: int, entity_declaration: Dict):
+    """
+    Validate the save_to value of a survey row, if it has one.
+
+    :param row: The survey sheet row.
+    :param row_number: The row number, used in error messages.
+    :param entity_declaration: The parsed entity declaration; empty if no
+      entity is declared.
+    :raises PyXFormError: If save_to is used without a declared entity, on a
+      group or repeat, or with a reserved or invalid property name.
+    """
+    save_to = row.get("bind", {}).get("entities:saveto", "")
+    if not save_to:
+        return
+    # Decode first: the str comparisons and tag regex below need a str.
+    if isinstance(save_to, bytes):
+        save_to = save_to.decode("utf-8")
+
+    if len(entity_declaration) == 0:
+        raise PyXFormError(
+            "To save entity properties using the save_to column, you must add an entities sheet and declare an entity."
+        )
+
+    # A row without a type is treated as an empty type rather than crashing.
+    row_type = row.get(constants.TYPE) or ""
+    if constants.GROUP in row_type or constants.REPEAT in row_type:
+        raise PyXFormError(
+            f"{constants.ROW_FORMAT_STRING % row_number} Groups and repeats can't be saved as entity properties."
+        )
+
+    error_start = f"{constants.ROW_FORMAT_STRING % row_number} Invalid save_to name:"
+
+    if save_to in ("name", "label"):
+        raise PyXFormError(
+            f"{error_start} the entity property name '{save_to}' is reserved."
+        )
+
+    if save_to.startswith(constants.ENTITIES_RESERVED_PREFIX):
+        raise PyXFormError(
+            f"{error_start} the entity property name '{save_to}' starts with reserved prefix {constants.ENTITIES_RESERVED_PREFIX}."
+        )
+
+    if not is_valid_xml_tag(save_to):
+        raise PyXFormError(
+            f"{error_start} '{save_to}'. Entity property names {constants.XML_IDENTIFIER_ERROR_MESSAGE}"
+        )
diff --git a/pyxform/entity_declaration.py b/pyxform/entities/entity_declaration.py
similarity index 100%
rename from pyxform/entity_declaration.py
rename to pyxform/entities/entity_declaration.py
diff --git a/pyxform/survey_element.py b/pyxform/survey_element.py
index f963fb55..ade87aba 100644
--- a/pyxform/survey_element.py
+++ b/pyxform/survey_element.py
@@ -14,10 +14,10 @@
     BRACKETED_TAG_REGEX,
     INVALID_XFORM_TAG_REGEXP,
     default_is_dynamic,
-    is_valid_xml_tag,
     node,
 )
 from pyxform.xls2json import print_pyobj_to_json
+from pyxform.xlsparseutils import is_valid_xml_tag
 
 if TYPE_CHECKING:
     from typing import List
diff --git a/pyxform/utils.py b/pyxform/utils.py
index 73a53051..de5c7fde 100644
--- a/pyxform/utils.py
+++ b/pyxform/utils.py
@@ -20,11 +20,6 @@
 
 SEP = "_"
 
-# http://www.w3.org/TR/REC-xml/
-TAG_START_CHAR = r"[a-zA-Z:_]"
-TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
-XFORM_TAG_REGEXP = "%(start)s%(char)s*" % {"start": TAG_START_CHAR, "char": TAG_CHAR}
-
 INVALID_XFORM_TAG_REGEXP = r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*"
 
 LAST_SAVED_INSTANCE_NAME = "__last-saved"
@@ -67,13 +62,6 @@ def writexml(self, writer, indent="", addindent="", newl=""):
             writer.write(data)
 
 
-def is_valid_xml_tag(tag):
-    """
-    Use a regex to see if there are any invalid characters (i.e. spaces).
-    """
-    return re.search(r"^" + XFORM_TAG_REGEXP + r"$", tag)
-
-
 def node(*args, **kwargs):
     """
     args[0] -- a XML tag
diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
index 59ded054..014411fd 100644
--- a/pyxform/xls2json.py
+++ b/pyxform/xls2json.py
@@ -12,26 +12,25 @@
 
 from pyxform import aliases, constants
 from pyxform.constants import (
-    ENTITIES_RESERVED_PREFIX,
+    _MSG_SUPPRESS_SPELLING,
     EXTERNAL_INSTANCE_EXTENSIONS,
     ROW_FORMAT_STRING,
-    TYPE,
     XML_IDENTIFIER_ERROR_MESSAGE,
 )
+from pyxform.entities.entities_parsing import (
+    get_entity_declaration,
+    validate_entity_saveto,
+)
 from pyxform.errors import PyXFormError
-from pyxform.utils import default_is_dynamic, is_valid_xml_tag, levenshtein_distance
+from pyxform.utils import default_is_dynamic
 from pyxform.validators.pyxform import parameters_generic, select_from_file_params
 from pyxform.validators.pyxform.missing_translations_check import (
     missing_translations_check,
 )
 from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
+from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
 
 SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
-_MSG_SUPPRESS_SPELLING = (
-    " If you do not mean to include a sheet, to suppress this message, "
-    "prefix the sheet name with an underscore. For example 'setting' "
-    "becomes '_setting'."
-)
 
 
 def print_pyobj_to_json(pyobj, path=None):
@@ -323,117 +322,6 @@ def process_image_default(default_value):
     return default_value
 
 
-def find_sheet_misspellings(key: str, keys: "KeysView") -> "Optional[str]":
-    """
-    Find possible sheet name misspellings to warn the user about.
-
-    It's possible that this will warn about sheet names for sheets that have
-    auxilliary metadata that is not meant for processing by pyxform. For
-    example the "osm" sheet name may be similar to many other initialisms.
-
-    :param key: The sheet name to look for.
-    :param keys: The workbook sheet names.
-    """
-    candidates = tuple(
-        _k  # thanks to black
-        for _k in keys
-        if 2 >= levenshtein_distance(_k.lower(), key)
-        and _k not in constants.SUPPORTED_SHEET_NAMES
-        and not _k.startswith("_")
-    )
-    if 0 < len(candidates):
-        msg = (
-            "When looking for a sheet named '{k}', the following sheets with "
-            "similar names were found: {c}."
-        ).format(k=key, c=str(", ".join(("'{}'".format(c) for c in candidates))))
-        return msg
-    else:
-        return None
-
-
-def get_entity_declaration(workbook_dict: Dict, warnings: List) -> Dict:
-    entities_sheet = workbook_dict.get(constants.ENTITIES, [])
-
-    if len(entities_sheet) == 0:
-        similar = find_sheet_misspellings(
-            key=constants.ENTITIES, keys=workbook_dict.keys()
-        )
-        if similar is not None:
-            warnings.append(similar + _MSG_SUPPRESS_SPELLING)
-        return {}
-    elif len(entities_sheet) > 1:
-        raise PyXFormError(
-            "This version of pyxform only supports declaring a single entity per form. Please make sure your entities sheet only declares one entity."
-        )
-
-    entity = entities_sheet[0]
-    dataset = entity["dataset"]
-
-    if dataset.startswith(ENTITIES_RESERVED_PREFIX):
-        raise PyXFormError(
-            f"Invalid dataset name: '{dataset}' starts with reserved prefix {ENTITIES_RESERVED_PREFIX}."
-        )
-
-    if not is_valid_xml_tag(dataset):
-        if isinstance(dataset, bytes):
-            dataset = dataset.encode("utf-8")
-
-        raise PyXFormError(
-            f"Invalid dataset name: '{dataset}'. Dataset names {XML_IDENTIFIER_ERROR_MESSAGE}"
-        )
-
-    if not ("label" in entity):
-        raise PyXFormError("The entities sheet is missing the required label column.")
-
-    creation_condition = entity["create_if"] if "create_if" in entity else "1"
-
-    return {
-        "name": "entity",
-        "type": "entity",
-        "parameters": {
-            "dataset": dataset,
-            "create": creation_condition,
-            "label": entity["label"],
-        },
-    }
-
-
-def validate_entity_saveto(row: Dict, row_number: int, entity_declaration: Dict):
-    save_to = row.get("bind", {}).get("entities:saveto", "")
-    if not save_to:
-        return
-
-    if len(entity_declaration) == 0:
-        raise PyXFormError(
-            "To save entity properties using the save_to column, you must add an entities sheet and declare an entity."
-        )
-
-    if constants.GROUP in row.get(TYPE) or constants.REPEAT in row.get(TYPE):
-        raise PyXFormError(
-            f"{ROW_FORMAT_STRING % row_number} Groups and repeats can't be saved as entity properties."
-        )
-
-    error_start = f"{ROW_FORMAT_STRING % row_number} Invalid save_to name:"
-
-    if save_to == "name" or save_to == "label":
-        raise PyXFormError(
-            f"{error_start} the entity property name '{save_to}' is reserved."
-        )
-
-    if save_to.startswith(ENTITIES_RESERVED_PREFIX):
-        raise PyXFormError(
-            f"{error_start} the entity property name '{save_to}' starts with reserved prefix {ENTITIES_RESERVED_PREFIX}."
-        )
-
-    if not is_valid_xml_tag(save_to):
-        if isinstance(save_to, bytes):
-            save_to = save_to.encode("utf-8")
-
-        raise PyXFormError(
-            f"{error_start} '{save_to}'. Entity property names {XML_IDENTIFIER_ERROR_MESSAGE}"
-        )
-
-
 def workbook_to_json(
     workbook_dict,
     form_name=None,
diff --git a/pyxform/xlsparseutils.py b/pyxform/xlsparseutils.py
new file mode 100644
index 00000000..9a10a770
--- /dev/null
+++ b/pyxform/xlsparseutils.py
@@ -0,0 +1,54 @@
+import re
+from typing import KeysView, Optional
+
+from pyxform import constants
+from pyxform.utils import levenshtein_distance
+
+# http://www.w3.org/TR/REC-xml/
+TAG_START_CHAR = r"[a-zA-Z:_]"
+TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
+XFORM_TAG_REGEXP = "%(start)s%(char)s*" % {"start": TAG_START_CHAR, "char": TAG_CHAR}
+
+
+def find_sheet_misspellings(key: str, keys: "KeysView") -> "Optional[str]":
+    """
+    Find possible sheet name misspellings to warn the user about.
+
+    It's possible that this will warn about sheet names for sheets that have
+    auxiliary metadata that is not meant for processing by pyxform. For
+    example the "osm" sheet name may be similar to many other initialisms.
+
+    :param key: The sheet name to look for.
+    :param keys: The workbook sheet names.
+    :return: A warning message listing the similar sheet names, or None if
+      no similar names were found.
+    """
+    candidates = tuple(
+        _k  # thanks to black
+        for _k in keys
+        if 2 >= levenshtein_distance(_k.lower(), key)
+        and _k not in constants.SUPPORTED_SHEET_NAMES
+        and not _k.startswith("_")
+    )
+    if 0 < len(candidates):
+        msg = (
+            "When looking for a sheet named '{k}', the following sheets with "
+            "similar names were found: {c}."
+        ).format(k=key, c=str(", ".join(("'{}'".format(c) for c in candidates))))
+        return msg
+    else:
+        return None
+
+
+def is_valid_xml_tag(tag):
+    """
+    Check that the whole tag is a valid XML identifier.
+
+    re.fullmatch is used rather than re.search with "^...$" anchors because
+    "$" also matches just before a trailing newline, so a tag that ends with
+    a newline would otherwise pass validation.
+
+    :param tag: The candidate tag name.
+    :return: A match object if the tag is valid, otherwise None.
+    """
+    return re.fullmatch(XFORM_TAG_REGEXP, tag)