diff --git a/CHANGELOG.md b/CHANGELOG.md index 73548cb..e815347 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## 0.3.0 + +- add bioscan manifest v3 support and set it as the template; v2 input is still supported +- amend STS output to match the v3 manifest, so both v2 and v3 manifests convert into the same STS output +- regex validations for all format-sensitive fields, including dates; reimplement date comparison +- strip spurious times appended to dates by the Excel-to-pandas conversion +- validate identifier against contributors +- validate input filename +- continued error message clarifications +- logic updates, bug fixes, general refactoring +- tested on backlog pre-release ## 0.2.0 diff --git a/data/BIOSCAN_Manifest_SOP_V3_20230818.docx b/data/BIOSCAN_Manifest_SOP_V3_20230818.docx new file mode 100644 index 0000000..2e37a71 Binary files /dev/null and b/data/BIOSCAN_Manifest_SOP_V3_20230818.docx differ diff --git a/data/BIOSCAN_Manifest_V2.0_20230727.xlsx b/data/BIOSCAN_Manifest_V2.0_20230727.xlsx index 57abb3f..07e30b2 100644 Binary files a/data/BIOSCAN_Manifest_V2.0_20230727.xlsx and b/data/BIOSCAN_Manifest_V2.0_20230727.xlsx differ diff --git a/data/BIOSCAN_Manifest_V3_20230818.xlsx b/data/BIOSCAN_Manifest_V3_20230818.xlsx new file mode 100644 index 0000000..0a8a62a Binary files /dev/null and b/data/BIOSCAN_Manifest_V3_20230818.xlsx differ diff --git a/work/validate_anospp.ipynb b/work/validate_anospp.ipynb index 9e41e8e..ea451f1 100644 --- a/work/validate_anospp.ipynb +++ b/work/validate_anospp.ipynb @@ -27,7 +27,7 @@ "source": [ "def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx', \n", " verbose=False, samples_sheet='TAB 2 Metadata Entry',\n", - " contrib_sheet='TAB 1 Contributors'):\n", + " contrib_sheet='TAB 1 Contributors', write_sts=True):\n", " '''\n", " ANOSPP partner manifest validation\n", " Validation follows the column order of the data entry sheet\n", " '''\n", "\n", @@ -35,12 +35,13 @@ "\n", " setup_logging(verbose=verbose)\n", "\n", - " logging.warning(f'# partner manifest validation v.{VALIDATION_VERSION}')\n", - " logging.warning(f'# validating ANOSPP manifest v.{ANOSPP_VERSION}')\n", - " logging.warning(f'# manifest {fn}')\n", + " logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n", + " logging.info(f'# validating against ANOSPP manifest v.{ANOSPP_VERSION}')\n", + " logging.info(f'# manifest \"{fn}\"')\n", "\n", " # read data\n", " df = get_data(fn, sheet=samples_sheet)\n", + " df = fix_date_formats(df)\n", " # validate series, exclude non-numeric\n", " df = validate_series(df)\n", " # clean up data\n", @@ -69,9 +70,9 @@ " validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n", " validate_values('ORGANISM_PART', df, valid_dict, sep='|')\n", " # columns below validated for non-blank samples only\n", - " date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", - " validate_float('DECIMAL_LATITUDE', df[~is_blank])\n", - " validate_float('DECIMAL_LONGITUDE', df[~is_blank])\n", + " validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " validate_regex('DECIMAL_LATITUDE', df[~is_blank])\n", + " validate_regex('DECIMAL_LONGITUDE', df[~is_blank])\n", " # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE\n", " validate_country_and_coordinates(df[~is_blank], fn, na_values=[''])\n", " # COLLECTION_LOCATION not checked\n", @@ -84,8 +85,8 @@ " validate_freetext('IDENTIFIED_HOW', df[~is_blank])\n", " 
validate_values('LIFESTAGE', df[~is_blank], valid_dict)\n", " validate_values('SEX', df[~is_blank], valid_dict, na_values = [''])\n", - " validate_time('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", - " validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", " validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)\n", " validate_values('OUTDOORS_INDOORS', df[~is_blank], valid_dict, na_values = [''])\n", " validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df[~is_blank])\n", @@ -95,22 +96,29 @@ " validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])\n", " validate_values('GRAVIDITY', df, valid_dict, na_values=[''])\n", " validate_freetext('HABITAT', df)\n", - " date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION\n", - " compare_dates(before=date_coll, after=date_pres)\n", - " validate_float('ELEVATION', df, na_values=[''])\n", - " validate_wtw('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n", + " validate_regex('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION\n", + " compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PRESERVATION', df[~is_blank])\n", + " validate_regex('ELEVATION', df, na_values=[''])\n", + " validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n", " validate_freetext('OTHER_ORGANISMS', df)\n", " validate_freetext('BIOASSAYS', df)\n", " validate_freetext('COLLECTOR_SAMPLE_ID', df)\n", " validate_freetext('OTHER_INFORMATION', df)\n", - " validate_freetext('MISC_METADATA', df)\n", + " # MISC_METADATA can be removed safely\n", + " if 'MISC_METADATA' in df.columns:\n", + " validate_freetext('MISC_METADATA', df)\n", " validate_freetext('DNA_EXTRACTION_DESCRIPTION', df)\n", - " validate_float('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])\n", - " validate_float('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])\n", + " validate_regex('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])\n", + " validate_regex('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])\n", " \n", - " logging.warning('# ended validate_anospp_partner_manifest_v.{}'.format(ANOSPP_VERSION))\n", + " \n", + " \n", + " df = add_sts_cols(df, contrib_df, gal, bioscan=False, v='NA')\n", + " if write_sts:\n", + " write_sts_manifest(df, fn, VALIDATION_VERSION)\n", "\n", - " # TODO yield table ready for STS submission\n", + " logging.info('# ended validate_anospp_partner_manifest')\n", + " \n", " return df\n", "\n", "fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n", @@ -152,14 +160,14 @@ "metadata": {}, "outputs": [], "source": [ - "fn = '../results/20230725_aydi_akje_re/Manifest-MADAGASCAR 2023 II.xlsx'\n", - "df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributor')" + "fn = '../results/20230814_olaitan_2_re_re/Anopheles_Metadata_Manifest_V4.0_EKUW - 14.08.2023 - antivec samples.xlsx'\n", + "df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors')" ] }, { "cell_type": "code", "execution_count": null, - "id": "0ff01236", + "id": "520423bd", "metadata": {}, "outputs": [], "source": [] diff --git a/work/validate_bioscan.ipynb b/work/validate_bioscan.ipynb index b431891..48c9540 100644 --- 
a/work/validate_bioscan.ipynb +++ b/work/validate_bioscan.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n", + "def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n", " samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n", " verbose=False, write_sts=True):\n", " '''\n", @@ -27,12 +27,15 @@ "\n", " setup_logging(verbose=verbose)\n", "\n", - " logging.warning(f'# partner manifest validation v.{VALIDATION_VERSION}')\n", - " logging.warning(f'# validating BIOSCAN manifest v.{BIOSCAN_VERSION}')\n", - " logging.warning(f'# manifest {fn}')\n", + " logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n", + " logging.info(f'# validating against BIOSCAN manifest v.{BIOSCAN_VERSION}')\n", + " logging.info(f'# manifest \"{fn}\"')\n", "\n", " # read data\n", " df = get_data(fn, sheet=samples_sheet)\n", + " df = fix_date_formats(df)\n", + " # check manifest version\n", + " v = infer_bioscan_version(df)\n", " # check series, exclude non-numeric\n", " df = validate_series(df)\n", " # clean up data\n", @@ -44,36 +47,43 @@ " \n", " # prepare for validation\n", " template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')\n", - " check_columns(df, template_df)\n", + " check_columns(df, template_df, bioscan_version=v)\n", " valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')\n", " contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)\n", "\n", " # orange cols\n", - " validate_catch_lot('CATCH_LOT', df, na_values=[''])\n", - " df, gal = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True)\n", + " validate_regex('CATCH_LOT', df, na_values=[])\n", + " df, gal, partner_code = validate_plates_wells(\n", + " df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)\n", " # check and exclude blanks\n", " is_blank = check_blanks(df)\n", " if df[~is_blank].shape[0] == 0:\n", " logging.error('no non-blank samples to validate, terminating')\n", " return df\n", " validate_values('ORGANISM_PART', df, valid_dict, sep='|')\n", + " df = strip_asterisks('ORGANISM_PART', df)\n", " validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n", + " df = strip_asterisks('PRESERVATIVE_SOLUTION', df)\n", " # columns below validated for non-blank samples only\n", + " if v == 'v3':\n", + " validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)\n", + " df = strip_asterisks('CATCH_SOLUTION', df)\n", " validate_values('BOTTLE_DIRECTION', df[~is_blank], valid_dict) # TODO allow for blank in non-Malaise trap samples\n", - " date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", - " validate_float('DECIMAL_LATITUDE', df[~is_blank])\n", - " validate_float('DECIMAL_LONGITUDE', df[~is_blank])\n", + " validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n", + " validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])\n", + " validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])\n", " # COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE\n", " validate_country_and_coordinates(df[~is_blank], fn, na_values=[''], bioscan=True)\n", " # COLLECTION_LOCATION not checked\n", " \n", " # purple cols - validated for non-blank samples\n", - " validate_wtw('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n", - " validate_time('TIME_OF_COLLECTION', 
df[~is_blank], bioscan=True)\n", - " validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n", - " validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)\n", - " date_plat = validate_date('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", - " compare_dates(before=date_coll, after=date_plat)\n", + " validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n", + " validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])\n", + " df = strip_asterisks('COLLECTION_METHOD', df)\n", + " validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])\n", " # taxonomy validation adds taxid columns to original dataframe - skipping for now\n", " df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])\n", " validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])\n", @@ -83,7 +93,8 @@ " validate_values('SORTING_SOLUTION_USED', df[~is_blank], valid_dict, na_values=[''])\n", " validate_values('CATCH_BOTTLE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n", " validate_values('PLATE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n", - " \n", + " if v == 'v3':\n", + " validate_values('AMOUNT_OF_CATCH_PLATED', df[~is_blank], valid_dict)\n", " # white cols - validated for all samples\n", " validate_freetext('MORPHOSPECIES_DESCRIPTION', df)\n", " validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df)\n", @@ -92,18 +103,22 @@ " # TODO check if STS will need something here\n", " validate_freetext('COLLECTOR_SAMPLE_ID', df)\n", " validate_freetext('VOUCHER_ID', df)\n", - " validate_float('ELEVATION', df, na_values=[''])\n", + " validate_regex('ELEVATION', df, na_values=[''])\n", " validate_freetext('OTHER_INFORMATION', df)\n", " validate_freetext('MISC_METADATA', df)\n", - " # IDENTIFIED_BY not checked\n", - " # IDENTIFIER_AFFILIATION not checked\n", + " validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])\n", " \n", " df = expand_plate_only(df)\n", - " df = add_sts_cols(df, contrib_df, gal)\n", + " df = add_sts_cols(df, contrib_df, gal, bioscan=True, v=v)\n", + " \n", + " # keep filename operations together\n", + " validate_input_filename(fn, partner_code, v)\n", " if write_sts:\n", " write_sts_manifest(df, fn, VALIDATION_VERSION)\n", " \n", - " logging.warning('# ended validate_bioscan_partner_manifest_v.{}'.format(BIOSCAN_VERSION))\n", + " logging.info('# ended validation of bioscan partner manifest')\n", + " \n", + " print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n", "\n", " return df" ] @@ -111,15 +126,15 @@ { "cell_type": "code", "execution_count": null, - "id": "230f6339", + "id": "c9b0c5c4", "metadata": {}, "outputs": [], "source": [ "# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'\n", - "df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n", - " template_fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n", + "df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n", + " template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n", " verbose=False,\n", - " samples_sheet='TAB 5 DO NOT EDIT - TEST Met',\n", + " samples_sheet='TAB 6 TEST',\n", " 
write_sts=True)" ] }, @@ -133,6 +148,14 @@ "raise Exception('Setup complete')" ] }, + { + "cell_type": "markdown", + "id": "a030e1bb", + "metadata": {}, + "source": [ + "## Validation" + ] + }, { "cell_type": "code", "execution_count": null, @@ -140,14 +163,14 @@ "metadata": {}, "outputs": [], "source": [ - "fn = '../results/20230712_js_batch1/[NENM]_[2307]_BIOSCAN_Manifest_V2.0.xlsx'\n", + "fn = '../results/20230817_js_batch8/NBGW-2023-Manifest-2023-08-14.xlsx'\n", "df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "43a0cf51", + "id": "6a82cc40", "metadata": {}, "outputs": [], "source": [] diff --git a/work/validate_partner_manifest_dev.ipynb b/work/validate_partner_manifest_dev.ipynb index 30b7dfa..c4bb069 100644 --- a/work/validate_partner_manifest_dev.ipynb +++ b/work/validate_partner_manifest_dev.ipynb @@ -24,6 +24,7 @@ "import pickle\n", "import ete3\n", "import geopy\n", + "import re\n", "from geopy.geocoders import Nominatim\n", "from geopy.extra.rate_limiter import RateLimiter" ] @@ -34,9 +35,10 @@ "metadata": {}, "outputs": [], "source": [ - "VALIDATION_VERSION = '0.2.0'\n", + "VALIDATION_VERSION = '0.3.0'\n", "ANOSPP_VERSION = '4.0'\n", - "BIOSCAN_VERSION = '2.0'" + "# V2.0, but V3 in SOP\n", + "BIOSCAN_VERSION = '3'" ] }, { @@ -91,7 +93,7 @@ "source": [ "def get_data(fn, sheet='TAB 2 Metadata Entry'):\n", "\n", - " logging.debug(f'reading sample metadata from \"{fn}\" sheet \"{sheet}\"')\n", + " logging.debug(f'reading data from \"{fn}\" sheet \"{sheet}\"')\n", " \n", " df = pd.read_excel(fn, dtype=str, index_col=0, keep_default_na=False,\n", " sheet_name=sheet)\n", @@ -107,7 +109,73 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_series(df):\n", + "def fix_date_formats(df):\n", + " # time can be appended to date by conversion to string in pd.read_excel\n", + " # fixing date formats appearing due to string conversion on reading\n", + " logging.debug('fixing date formats by removing appended times')\n", + " \n", + " for col in ['DATE_OF_COLLECTION', 'DATE_OF_PLATING', 'DATE_OF_PRESERVATION']:\n", + " if col in df.columns:\n", + " df[col] = df[col].str.replace(r' 00:00:00$','')\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def infer_bioscan_version(df):\n", + " \n", + " logging.debug(f'inferring bioscan version from data format')\n", + " \n", + " version_evidence = {}\n", + " \n", + " # columns in v3, not in v2\n", + " for col in ('CATCH_SOLUTION', 'AMOUNT_OF_CATCH_PLATED'):\n", + " if col in df.columns:\n", + " version_evidence[col] = 'v3'\n", + " else:\n", + " version_evidence[col] = 'v2'\n", + " \n", + " # IDENTIFIER_AFFILIATION in v2, not in v3 - not checking as dropped from output\n", + " \n", + " # blank samples conventions\n", + " v3_plate_only = (('PLATE_ONLY_1_BLANK' in df['TUBE_OR_WELL_ID']) \n", + " or ('PLATE_ONLY_2_BLANKS' in df['TUBE_OR_WELL_ID']))\n", + " if 'PLATE_ONLY' in df['TUBE_OR_WELL_ID']:\n", + " if v3_plate_only:\n", + " logging.error('found both bioscan v2 and v3 style PLATE_ONLY entries in manifest')\n", + " raise ValueError('fix bioscan manifest version before proceeding')\n", + " else:\n", + " version_evidence['PLATE_ONLY'] = 'v2'\n", + " elif v3_plate_only:\n", + " version_evidence['PLATE_ONLY'] = 'v3'\n", + " \n", + " # conflicting evidence\n", + " if len(set(version_evidence.values())) > 1:\n", + " msg = 'found 
conflicting evidence on bioscan version: '\n", + " for k, v in version_evidence.items():\n", + " msg += f'{k} suggests {v}, '\n", + " logging.error(msg[:-2])\n", + " raise ValueError('fix bioscan manifest version before proceeding')\n", + " \n", + " # returns only if all evidence is concordant\n", + " v = version_evidence['CATCH_SOLUTION']\n", + " logging.info(f'bioscan manifest {v} inferred')\n", + " \n", + " return v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def validate_series(df, plate_col='RACK_OR_PLATE_ID', well_col='TUBE_OR_WELL_ID'):\n", " \n", " # series should be 1,2, ..., nsamples\n", " logging.debug('validating SERIES column')\n", @@ -123,6 +191,12 @@ " f'{df.index[~series_numeric].to_list()} - '\n", " f'these will be excluded from validation and output')\n", " df = df.loc[series_numeric]\n", + " \n", + " # exclude empty rows based on empty \n", + " empty_rows = (df[plate_col] == '') | (df[well_col] == '')\n", + " if empty_rows.any():\n", + " logging.info(f'found and excluded {empty_rows.sum()} rows because {plate_col} or {well_col} is empty')\n", + " df = df.loc[~empty_rows]\n", " \n", " # check the remaining SERIES are continuous\n", " expected_series = set([str(i) for i in range(1, df.shape[0] + 1)])\n", @@ -145,7 +219,7 @@ "def index_ranges(df):\n", " \n", " # based on https://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python\n", - " i = df.index.to_list()\n", + " i = df.index.astype(int).to_list()\n", " ranges = []\n", " for a, b in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):\n", " b = list(b)\n", @@ -201,16 +275,24 @@ "metadata": {}, "outputs": [], "source": [ - "def check_columns(df, template_df):\n", + "def check_columns(df, template_df, bioscan_version='v3', optional_columns=['MISC_METADATA']):\n", " \n", " logging.debug('checking manifest columns against template')\n", " \n", " data_cols = set(df.columns)\n", " template_cols = set(template_df.columns)\n", + " optional_cols = set(optional_columns)\n", + " \n", + " # assuming validation against v3 template\n", + " if bioscan_version == 'v2':\n", + " template_cols.remove('CATCH_SOLUTION')\n", + " template_cols.remove('AMOUNT_OF_CATCH_PLATED')\n", + " template_cols.update(['IDENTIFIER_AFFILIATION'])\n", " \n", " if data_cols - template_cols != set():\n", " logging.info(f'extra columns in filled manifest compared to template: {data_cols - template_cols}')\n", - " if template_cols - data_cols != set():\n", + " # do not report missing optional columns\n", + " if template_cols - data_cols - optional_cols != set():\n", " logging.error(f'template columns missing from filled manifest: {template_cols - data_cols}')\n", "\n", "# check_columns(df, template_df)" @@ -314,9 +396,9 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_catch_lot(col, df, na_values=[]):\n", + "def validate_regex(col, df, na_values=[]):\n", " \n", - " logging.debug(f'validating catch lot format in {col} column')\n", + " logging.debug(f'validating data format in {col} column')\n", " \n", " if col not in df.columns:\n", " logging.error(f'{col} column not found in manifest')\n", @@ -324,13 +406,44 @@ " series = df[col]\n", " series = exclude_missing(series, na_values)\n", " \n", - " regex = '^C\\d{3}[A-Z]$'\n", + " # dates between 1900 and 2100\n", + " date_regex = (r'^(19\\d\\d|20\\d\\d)-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])$|'\n", + " r'^(19\\d\\d|20\\d\\d)-(0[1-9]|1[0-2])$|'\n", + " r'^(19\\d\\d|20\\d\\d)$')\n", + " 
\n", + " numeric_regex = r'^[-+]?\\d*\\.?\\d+$'\n", + " \n", + " regexs = {\n", + " 'CATCH_LOT': (r'^C\\d{3}[A-Z]$|^NOT_APPLICABLE$', \n", + " 'like C123A or NOT_APPLICABLE'),\n", + " 'DATE_OF_COLLECTION': (date_regex, 'in YYYY-MM-DD format'),\n", + " 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d+)?$', \n", + " 'between -90 and 90'),\n", + " 'DECIMAL_LONGITUDE': (r'^[-+]?(1[0-7]\\d|\\d\\d|\\d)(\\.\\d+)?$', \n", + " 'between -180 and 180'),\n", + " 'WHAT_3_WORDS': (r'^///[a-z]+\\.[a-z]+\\.[a-z]+$', \n", + " 'like ///one.two.three'),\n", + " 'TIME_OF_COLLECTION': (r'^(?:[01]\\d|2[0-3]):[0-5]\\d$|^(?:[01]\\d|2[0-3]):[0-5]\\d:[0-5]\\d$', \n", + " 'in HH:MM format'),\n", + " 'DURATION_OF_COLLECTION': (r'^P(?:\\d+Y)?(?:\\d+M)?(?:\\d+D)?(?:T(?:\\d+H)?(?:\\d+M)?(?:\\d+S)?)?$', \n", + " 'in P[n]Y[n]M[n]DT[n]H[n]M[n]S format'),\n", + " 'DATE_OF_PLATING': (date_regex, 'in YYYY-MM-DD format'),\n", + " 'DATE_OF_PRESERVATION': (date_regex, 'in YYYY-MM-DD format'),\n", + " 'ELEVATION': (numeric_regex, 'only a number (in metres)'),\n", + " 'DNA_EXTRACT_VOLUME_PROVIDED': (numeric_regex, 'only a number (in microlitres)'),\n", + " 'DNA_EXTRACT_CONCENTRATION': (numeric_regex, 'only a number (in nanograms per microlitre)')\n", + " }\n", " \n", - " is_valid_catch_lot = series.str.match(regex)\n", - " if not is_valid_catch_lot.all():\n", - " offending_values = series[~is_valid_catch_lot].unique()\n", - " s = index_ranges(series[~is_valid_catch_lot])\n", - " logging.error(f'{col} format incorrect for SERIES {s}: found {offending_values} - expected to be like C123A')\n", + " is_valid_regex = series.str.match(regexs[col][0])\n", + " if not is_valid_regex.all():\n", + " offending_values = list(series[~is_valid_regex].unique())\n", + " s = index_ranges(series[~is_valid_regex])\n", + " msg = (f'{col} format incorrect for SERIES {s}: found {offending_values} - '\n", + " f'expected to be {regexs[col][1]}')\n", + " if col == 'CATCH_LOT':\n", + " logging.warning(msg)\n", + " else:\n", + " logging.error(msg)\n", " " ] }, @@ -340,17 +453,13 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_plates_wells(df, contrib_df, plate_col='RACK_OR_PLATE_ID', well_col='TUBE_OR_WELL_ID', bioscan=False):\n", + "def validate_plates_wells(df, contrib_df, plate_col='RACK_OR_PLATE_ID', well_col='TUBE_OR_WELL_ID', \n", + " bioscan=False, bioscan_version='v3'):\n", " \n", " bioscan_partners_fn = '../data/bioscan_partners.tsv'\n", " \n", " # expect only complete 96-well plates\n", " logging.debug(f'validating {plate_col} and {well_col} columns')\n", - " \n", - " empty_rows = (df[plate_col] == '') | (df[well_col] == '')\n", - " if empty_rows.any():\n", - " logging.info(f'found and excluded {empty_rows.sum()} empty rows based on {plate_col} and {well_col}')\n", - " df = df.loc[~empty_rows]\n", " \n", " # plate names validation\n", " plates = df[plate_col].drop_duplicates()\n", @@ -369,11 +478,12 @@ " possible_partner_codes = plate_prefixes.value_counts().index.to_list()\n", " if len(possible_partner_codes) > 1:\n", " logging.error(f'only one plate ID prefix expected, found multiple: {possible_partner_codes}')\n", + " # selecting most frequent prefix as partner code\n", " selected_partner_code = possible_partner_codes[0]\n", " selected_partner_df = partners_df.query(f'partner_code == \"{selected_partner_code}\"')\n", " if selected_partner_df.shape[0] == 0:\n", - " logging.error(f'partner code {selected_partner_code} not found in {bioscan_partners_fn}, '\n", - " f'using \"Sanger Institute\" as default option')\n", + " 
logging.error(f'plate name prefix {selected_partner_code} not found in {bioscan_partners_fn} '\n", + " f'as partner code, using \"Sanger Institute\" as default partner')\n", " gal = \"Sanger Institute\"\n", " else:\n", " gal = selected_partner_df['gal'].iloc[0]\n", @@ -381,6 +491,7 @@ " else:\n", " # anospp\n", " unknown_prefixes = (~plate_prefixes.isin(contrib_df['PARTNER_CODE']))\n", + " selected_partner_code = 'SANG'\n", " gal = \"Sanger Institute\"\n", " if unknown_prefixes.any():\n", " logging.error(f'plate ID prefixes not recognised for {plates[unknown_prefixes].to_list()}')\n", @@ -389,15 +500,23 @@ " row_id = list('ABCDEFGH')\n", " col_id = range(1,13)\n", " expected_wells = [r + str(c) for (r,c) in itertools.product(row_id, col_id)]\n", + " \n", + " # different plate-only values across bioscan versions\n", + " if bioscan_version == 'v2':\n", + " plate_only_vals = ['PLATE_ONLY']\n", + " elif bioscan_version == 'v3':\n", + " plate_only_vals = ['PLATE_ONLY_1_BLANK','PLATE_ONLY_2_BLANKS']\n", + " else:\n", + " raise ValueError(f'unhandled bioscan version {bioscan_version} at validate_plates_wells()') \n", " \n", " for plate, pdf in df.groupby(plate_col):\n", " \n", - " if bioscan and (pdf[well_col] == 'PLATE_ONLY').any():\n", + " if bioscan and (pdf[well_col].isin(plate_only_vals)).any():\n", " logging.debug(f'skipping well validation for PLATE_ONLY plate {plate}')\n", " continue\n", " \n", " # check for well duplicates\n", - " dup_wells = pdf[well_col].duplicated()\n", + " dup_wells = pdf[well_col].duplicated()\n", " if dup_wells.any():\n", " logging.error(f'duplicate {well_col} for plate {plate}: {pdf.loc[dup_wells, well_col].unique()}')\n", " # check for non A1...H12 wells\n", @@ -416,7 +535,7 @@ " logging.info(f'{df.shape[0]} samples found across {df[plate_col].nunique()} plates')\n", " \n", "\n", - " return df, gal\n", + " return df, gal, selected_partner_code\n", " \n", "# df = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')" ] @@ -434,28 +553,38 @@ " col_id = range(1,13)\n", " expected_wells = [r + str(c) for (c,r) in itertools.product(col_id, row_id)]\n", " \n", - " if (df[well_col] == 'PLATE_ONLY').any():\n", + " blank_wells = {\n", + " 'PLATE_ONLY':['H12'], # v2\n", + " 'PLATE_ONLY_1_BLANK':['H12'], # v3\n", + " 'PLATE_ONLY_2_BLANKS':['G12','H12'], # v3\n", + " }\n", + " \n", + " if df[well_col].isin(blank_wells.keys()).any():\n", " pdfs = []\n", "\n", " for plate, pdf in df.groupby(plate_col):\n", " # If plate level only metadata is being entered\n", " # put “PLATE_ONLY” in well \n", " # and use only one row to capture the metadata for the whole plate\n", - " if (pdf[well_col] == 'PLATE_ONLY').any():\n", - " logging.info(f'found PLATE_ONLY plate {plate}, expanding to 96 well rows')\n", + " if pdf[well_col].isin(blank_wells.keys()).any():\n", + " \n", " if pdf.shape[0] > 1:\n", " logging.error(f'expected one row in PLATE_ONLY plate {plate}, found {pdf.shape[0]} - '\n", " f'extracting metadata from first row only')\n", + " plate_only_val = pdf[well_col].iloc[0]\n", + " logging.info(f'found {plate_only_val} plate {plate}, expanding to 96 well rows, '\n", + " f'keeping {\" and \".join(blank_wells[plate_only_val])} blank')\n", " # expand to 96 rows\n", " pdf = pd.DataFrame(pdf.iloc[0] for i in range(len(expected_wells)))\n", " pdf[well_col] = expected_wells\n", - " # H12 blank\n", - " for col in pdf.columns:\n", - " if col == 'ORGANISM_PART':\n", - " pdf[col].iloc[-1]='NOT_APPLICABLE'\n", - " elif col not in 
['SERIES','CATCH_LOT','RACK_OR_PLATE_ID',\n", - " 'TUBE_OR_WELL_ID','ORGANISM_PART','PRESERVATIVE_SOLUTION']:\n", - " pdf[col].iloc[-1]=''\n", + " \n", + " for blank_well in blank_wells[plate_only_val]:\n", + " for col in pdf.columns:\n", + " if col == 'ORGANISM_PART':\n", + " pdf.loc[pdf[well_col] == blank_well, col] = 'NOT_APPLICABLE'\n", + " elif col not in ['SERIES','CATCH_LOT','RACK_OR_PLATE_ID',\n", + " 'TUBE_OR_WELL_ID','ORGANISM_PART','PRESERVATIVE_SOLUTION']:\n", + " pdf.loc[pdf[well_col] == blank_well, col] = ''\n", " pdfs.append(pdf)\n", " \n", " df = pd.concat(pdfs).reset_index(drop=True)\n", @@ -476,7 +605,6 @@ "metadata": {}, "outputs": [], "source": [ - "## TODO - which columns require NA, do not remove blanks to be able to get taxids for all\n", "def check_blanks(df):\n", " \n", " logging.debug('checking and excluding blank samples based on ORGANISM_PART being NOT_APPLICABLE') \n", @@ -500,7 +628,8 @@ " \n", " for col in df.columns:\n", " if col not in ['SERIES','CATCH_LOT','RACK_OR_PLATE_ID',\n", - " 'TUBE_OR_WELL_ID','ORGANISM_PART','PRESERVATIVE_SOLUTION']:\n", + " 'TUBE_OR_WELL_ID','ORGANISM_PART','PRESERVATIVE_SOLUTION',\n", + " 'OTHER_INFORMATION','MISC_METADATA']:\n", " excessive_blank_info_df = blank_df[blank_df[col] != '']\n", " if excessive_blank_info_df.shape[0] > 0:\n", " logging.error(f'{col} column values non-empty for blank samples '\n", @@ -508,7 +637,7 @@ " \n", " \n", " logging.info(f'{is_blank.sum()} blanks located across {df.RACK_OR_PLATE_ID.nunique()} plates, '\n", - " f'{non_blank_df.shape[0]} samples of {df.shape[0]} left for downstream analysis')\n", + " f'{non_blank_df.shape[0]} non-blank samples will be further validated')\n", " \n", " \n", " return is_blank\n", @@ -563,6 +692,7 @@ " logging.warning(msg)\n", " elif level == 'e':\n", " logging.error(msg)\n", + " \n", "# else:\n", "# logging.info('all values valid in {!r}'.format(col))\n", " \n", @@ -575,99 +705,17 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_date(col, df, na_values=[]):\n", - " \n", - " # TODO allow for YYYY-MM, YYYY, harmonise code with hooks implementation (regex)\n", - " \n", - " logging.debug(f'validating date format in {col} column')\n", - "\n", - " if col not in df.columns:\n", - " logging.error(f'{col} column not found in manifest')\n", - " return\n", - " series = df[col]\n", - " series = exclude_missing(series, na_values)\n", - " \n", - " # invalid date formats\n", - " # empty string converted to NaT\n", - " date_series = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce')\n", - " if date_series.isna().any():\n", - " logging.error(f'invalid date format in {col} column, SERIES {index_ranges(date_series[date_series.isna()])}: '\n", - " f'{series[date_series.isna()].unique()} - expected YYYY-MM-DD format')\n", - " valid_date_series = date_series[~date_series.isna()]\n", - " \n", - " # dates in future\n", - " future_dates = (valid_date_series > datetime.datetime.today())\n", - " if future_dates.any():\n", - " logging.error(f'future dates in {col} column, SERIES {index_ranges(valid_date_series[future_dates])}: '\n", - " f'{valid_date_series[future_dates].unique()}')\n", - " \n", - " # dates too old\n", - " old_dates = (valid_date_series < datetime.datetime.strptime('1900-01-01', '%Y-%m-%d'))\n", - " if old_dates.any():\n", - " logging.error(f'pre-1900 dates in {col} column, SERIES {index_ranges(valid_date_series[old_dates])}: '\n", - " f'{valid_date_series[old_dates].unique()}')\n", - " \n", - " return valid_date_series\n", - "\n", - "# 
validate_date('DATE_OF_COLLECTION', df, na_values=[])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# bd = validate_date('DATE_OF_COLLECTION', df)\n", - "# ad = validate_date('DATE_OF_PRESERVATION', df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compare_dates(before, after):\n", - " '''\n", - " comparing series generated by validate_date, harmonise code with hooks implementation (regex)\n", - " ''' \n", - " \n", - " logging.debug(f'checking that {before.name} is earlier than {after.name}')\n", + "def strip_asterisks(col, df):\n", "\n", - " ctdf = pd.concat([before.reset_index(), after.reset_index(drop=True)], axis=1).set_index('SERIES')\n", - " date_conflict = ctdf[before.name] > ctdf[after.name]\n", - " \n", - " if date_conflict.any():\n", - " logging.error(f'{before.name} column values are later than {after.name} column values in SERIES '\n", - " f'{index_ranges(ctdf[date_conflict])}')\n", - "\n", - " \n", - "# compare_dates(bd, ad)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def validate_wtw(col, df, na_values=[]):\n", - " \n", - " logging.debug(f'validating what3words format in {col} column')\n", - " \n", - " if col not in df.columns:\n", - " logging.error(f'{col} column not found in manifest')\n", - " return\n", " series = df[col]\n", - " series = exclude_missing(series, na_values)\n", - " \n", - " valid_wtw_series = series.str.match('///[a-z]+\\.[a-z]+\\.[a-z]+')\n", - " \n", - " if not valid_wtw_series.all():\n", - " invalid_wtw_series = series[~valid_wtw_series]\n", - " logging.error(f'invalid what3words format in {col}, SERIES {index_ranges(invalid_wtw_series)}: '\n", - " f'{invalid_wtw_series.unique()} - expected to be like \"///one.two.three\"')" + " asterisk_to_strip = (series.str.startswith(\"*\") & series.str.endswith(\"*\"))\n", + " if asterisk_to_strip.any():\n", + " asterisk_series = index_ranges(series[asterisk_to_strip])\n", + " logging.info(f'{col} column, SERIES {asterisk_series}: '\n", + " f'stripping asterisks from {series[asterisk_to_strip].unique()} for output')\n", + " series = series.str.strip('*')\n", + " df[col] = series\n", + " return df" ] }, { @@ -676,84 +724,24 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_time(col, df, na_values=[], bioscan=False):\n", - " \n", - " # TODO harmonise code with hooks implementation (regex)\n", + "def compare_dates_text(before_col, after_col, df):\n", " \n", - " logging.debug(f'validating time format in {col} column')\n", + " logging.debug(f'checking that {before_col} is earlier than {after_col}')\n", " \n", - " if col not in df.columns:\n", - " logging.error(f'{col} column not found in manifest')\n", - " return\n", - " series = df[col]\n", - " series = exclude_missing(series, na_values)\n", - " \n", - " # invalid time formats\n", - " # bioscan sheet format yields HH:MM:SS instead of HH:MM\n", - " time_format = '%H:%M:%S' if bioscan else '%H:%M'\n", - " # NB empty string converted to NaT\n", - " time_series = pd.to_datetime(series, format=time_format, errors='coerce')\n", - " if time_series.isna().any():\n", - " logging.error(\n", - " f'invalid time format in {col} column, SERIES {index_ranges(time_series[time_series.isna()])}: '\n", - " f'{series[time_series.isna()].unique()} - expected HH:MM'\n", - " )\n", - " valid_time_series = time_series[~time_series.isna()]\n", + " for col in (before_col, 
after_col):\n", + " if col not in df.columns:\n", + " logging.error(f'{col} column not found in manifest')\n", + " return\n", + " # invalid date formats or empty string converted to NaT\n", + " before_series = pd.to_datetime(df[before_col], format='%Y-%m-%d', errors='coerce').copy()\n", + " after_series = pd.to_datetime(df[after_col], format='%Y-%m-%d', errors='coerce').copy()\n", " \n", - " return valid_time_series\n", - "\n", - "# validate_time('TIME_OF_COLLECTION', df, na_values=[''])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def validate_time_period(col, df, na_values=[]):\n", + " date_conflict = before_series > after_series\n", " \n", - " logging.debug(f'validating time period format in {col} column')\n", + " s = index_ranges(df[date_conflict])\n", " \n", - " if col not in df.columns:\n", - " logging.error(f'{col} column not found in manifest')\n", - " return\n", - " series = df[col]\n", - " series = exclude_missing(series, na_values)\n", - "\n", - " # conversion with modifications for proper parsing \n", - " # by pd.Timedelta (does not accept missing data, e.g. 'PT1H')\n", - " # note - will not work for weeks and months\n", - " def convert_iso_duration(s):\n", - " if s == np.nan:\n", - " return np.nan\n", - " if not s.startswith('P') or 'T' not in s:\n", - " return np.nan\n", - " # add days\n", - " if s.startswith('PT'):\n", - " s = s.replace('PT','P0DT')\n", - " # add trailing minutes and seconds\n", - " if s.endswith('H'):\n", - " s += '0M0S'\n", - " elif s.endswith('M'):\n", - " s += '0S'\n", - " try:\n", - " return pd.Timedelta(s)\n", - " except:\n", - " return np.nan\n", - " time_period_series = series.apply(convert_iso_duration)\n", - " if time_period_series.isna().any():\n", - " logging.error(\n", - " f'invalid time period format in {col} column, '\n", - " f'SERIES {index_ranges(time_period_series[time_period_series.isna()])}: '\n", - " f'{series[time_period_series.isna()].unique()}'\n", - " )\n", - " valid_time_period_series = time_period_series[~time_period_series.isna()]\n", - " return valid_time_period_series\n", - "\n", - "# df.loc[1,'DURATION_OF_COLLECTION'] = 'PVT1H'\n", - "# validate_time_period('DURATION_OF_COLLECTION', df, na_values=['']);\n", - "# df['DURATION_OF_COLLECTION']" + " if date_conflict.any():\n", + " logging.error(f'{before_col} column values are later than {after_col} column values in SERIES {s}')" ] }, { @@ -762,7 +750,6 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO include tests\n", "def validate_country_and_coordinates(df, fn, na_values=[], bioscan=False):\n", " \n", " logging.debug('validating COUNTRY_OF_COLLECTION against DECIMAL_LATITUDE and DECIMAL_LONGITUDE')\n", @@ -904,8 +891,8 @@ " }\n", " \n", " expected_ranks = {\n", - " 'order':('order','class'),\n", - " 'family':('family','superfamily','subfamily','tribe','subtribe'),\n", + " 'order':('class','subclass','order'),\n", + " 'family':('suborder','superfamily','family','subfamily','tribe','subtribe'),\n", " 'genus':('genus','subgenus'),\n", " 'species':('species')\n", " }\n", @@ -948,7 +935,7 @@ " unmatched_names = set(tax_names) - set(tax_info[tax_level].keys())\n", " if len(unmatched_names) > 0:\n", " for tname in unmatched_names:\n", - " s = index_ranges(df[df[tax_col].str.match(f'^{tname}$', case=False)])\n", + " s = index_ranges(df[df[tax_col].str.match(f'^{re.escape(tname)}$', case=False)])\n", " if tax_level == 'species' and anospp:\n", " if tname in harbach_spp:\n", " logging.warning(f'{tax_col} column, 
SERIES {s}:'\n", @@ -967,7 +954,7 @@ " \n", " upd_tid = tids[0]\n", " \n", - " s = index_ranges(df[df[tax_col].str.match(f'^{tname}$', case=False)])\n", + " s = index_ranges(df[df[tax_col].str.match(f'^{re.escape(tname)}$', case=False)])\n", " \n", " if len(tids) == 1:\n", " if ranks[upd_tid] not in expected_ranks[tax_level]: \n", @@ -975,10 +962,10 @@ " f'(taxid {upd_tid}): \"{ranks[upd_tid]}\"')\n", " \n", " if len(tids) > 1: \n", - " for tid, r in ranks.items():\n", - " if r == expected_rank and len(tids) > 1:\n", + " for tid, rank in ranks.items():\n", + " if rank in expected_ranks[tax_level] and len(tids) > 1:\n", " logging.debug(f'{tax_col} column, SERIES {s}: using only first matching rank '\n", - " f'for \"{tname}\" (taxid {tid}): \"{r}\"')\n", + " f'for \"{tname}\" (taxid {tid}): \"{rank}\"')\n", " upd_tid = tid\n", " break\n", " else:\n", @@ -1008,7 +995,7 @@ " \n", " family_lineage = ncbi.get_lineage(family_id)\n", " \n", - " s = index_ranges(df[df['PREDICTED_FAMILY'].str.match(f'^{r.family}$', case=False)])\n", + " s = index_ranges(df[df['PREDICTED_FAMILY'].str.match(f'^{re.escape(r.family)}$', case=False)])\n", " \n", " if order_id not in family_lineage:\n", " logging.error(f'PREDICTED_FAMILY column, SERIES {s}: family \"{r.family}\" (taxid {family_id}) '\n", @@ -1025,7 +1012,7 @@ " \n", " genus_lineage = ncbi.get_lineage(genus_id)\n", " \n", - " s = index_ranges(df[df['PREDICTED_GENUS'].str.match(f'^{r.genus}$', case=False)])\n", + " s = index_ranges(df[df['PREDICTED_GENUS'].str.match(f'^{re.escape(r.genus)}$', case=False)])\n", " \n", " if order_id not in genus_lineage:\n", " logging.error(\n", @@ -1047,7 +1034,7 @@ " \n", " species_lineage = ncbi.get_lineage(species_id)\n", " \n", - " s = index_ranges(df[df['PREDICTED_SCIENTIFIC_NAME'].str.match(f'^{r.species}$', case=False)])\n", + " s = index_ranges(df[df['PREDICTED_SCIENTIFIC_NAME'].str.match(f'^{re.escape(r.species)}$', case=False)])\n", " \n", " if order_id not in species_lineage:\n", " logging.error(\n", @@ -1115,9 +1102,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_float(col, df, na_values=[]):\n", - " \n", - " logging.debug(f'validating float number format in {col} column')\n", + "def validate_identifier(col, df, contrib_df, na_values=['']):\n", " \n", " if col not in df.columns:\n", " logging.error(f'{col} column not found in manifest')\n", @@ -1125,18 +1110,14 @@ " series = df[col]\n", " series = exclude_missing(series, na_values)\n", " \n", - " for val in series.unique():\n", - " try:\n", - " float(val)\n", - " except:\n", - " s = index_ranges(df.query(f'{col} == \"{val}\"'))\n", - " logging.error(\n", - " f'{col} column, SERIES {s}: '\n", - " f'found non-numeric value \"{val}\"'\n", - " )\n", - " \n", + " expected_people = contrib_df['FULL_NAME'].to_list()\n", + " \n", + " for person in series.unique():\n", + " if not (person in expected_people):\n", + " s = index_ranges(series[series == person])\n", " \n", - "# validate_float('ELEVATION', df, na_values=[''])" + " logging.warning(f'{col} column, SERIES {s}: \"{person}\" not found in '\n", + " f'contributors sheet among {expected_people}')" ] }, { @@ -1172,7 +1153,28 @@ "metadata": {}, "outputs": [], "source": [ - "def add_sts_cols(df, contrib_df, gal, bioscan=True):\n", + "def validate_input_filename(input_fn, partner_code, bioscan_version):\n", + " \n", + " # ABCD_YYMM_\n", + " v = bioscan_version.strip('v')\n", + " fn_regex = f'^{partner_code}_(2[0-3])(0[1-9]|1[0-2])_BIOSCAN_Manifest_V{v}.*xlsx$'\n", + " \n", + " fn_basename = 
os.path.basename(input_fn)\n", + " \n", + " if not re.match(fn_regex, fn_basename):\n", + " logging.warning(f'input filename {fn_basename} does not match '\n", + " f'{partner_code}_YYMM_BIOSCAN_Manifest_V{v}*xlsx')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "def add_sts_cols(df, contrib_df, gal, bioscan=True, v='v2'):\n", + " \n", + " logging.debug('adding STS columns to manifest')\n", " \n", " is_blank = (df['ORGANISM_PART'] == 'NOT_APPLICABLE')\n", " \n", @@ -1188,7 +1190,16 @@ " df['SYMBIONT'] = 'TARGET'\n", " df['REGULATORY_COMPLIANCE'] = 'Y'\n", " df['HAZARD_GROUP'] = 'HG1'\n", - " df['CATCH_SOLUTION'] = '100%_ETHANOL'\n", + " if bioscan and v == 'v2':\n", + " logging.info('auto-filling CATCH_SOLUTION as 100%_ETHANOL and AMOUNT_OF_CATCH_PLATED as '\n", + " 'ALL_SPECIMENS_PLATED for bioscan manifest v2')\n", + " df['CATCH_SOLUTION'] = '100%_ETHANOL'\n", + " df['AMOUNT_OF_CATCH_PLATED'] = 'ALL_SPECIMENS_PLATED'\n", + " if (df['IDENTIFIER_AFFILIATION'] != '').any():\n", + " logging.warning('IDENTIFIER_AFFILIATION has some data filled in - '\n", + " 'note this column will be removed from output')\n", + " logging.info('dropping IDENTIFIER_AFFILIATION column for bioscan manifest v2')\n", + " df = df.drop(columns=['IDENTIFIER_AFFILIATION'])\n", " # add contributors - delimiters checked in validate_contributors\n", " contrib_series = contrib_df['FULL_NAME'] + ';' + \\\n", " contrib_df['PRIMARY_AFFILIATION'] + ';' + \\\n", " contrib_df['CONTRIBUTION']\n", " df['CONTRIBUTORS'] = '|'.join(list(contrib_series))\n", " \n", + " if 'MISC_METADATA' in df.columns:\n", + " logging.info('dropping MISC_METADATA column')\n", + " if (df['MISC_METADATA'] != '').any():\n", + " logging.warning('MISC_METADATA has some data filled in - note this column will be removed from output')\n", + " df = df.drop(columns=['MISC_METADATA'])\n", + " \n", " return df\n", "\n", "# add_sts_cols(df, contrib_df, gal='Sanger Institute');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def write_sts_manifest(df, input_fn, validation_version):\n", " \n", " # NB str.rstrip('.xlsx') strips trailing characters, not the suffix\n", " output_fn = re.sub(r'\.xlsx$', '', input_fn) + '_' + validation_version + '_for_sts.xlsx'\n", - " \n", - " logging.info(f'writing STS manifest to {output_fn}')\n", + " \n", + " logging.info(f'writing STS manifest to \"{output_fn}\"')\n", " \n", " df.to_excel(output_fn, sheet_name='Metadata Entry')\n", " "
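
Note on the 0.3.0 rework of format checks: the changelog entries "regex validations for all format-sensitive fields" and "reimplement date comparison" correspond to validate_regex and compare_dates_text above. A minimal, self-contained sketch of the idea follows - per-column regular expressions for format, then a plain pd.to_datetime pass for chronology. The names demo_df, check_format and check_order are illustrative only, not the notebook functions:

import pandas as pd

# dates between 1900 and 2099: YYYY-MM-DD, YYYY-MM or YYYY, as in validate_regex
DATE_REGEX = (r'^(19\d\d|20\d\d)-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$|'
              r'^(19\d\d|20\d\d)-(0[1-9]|1[0-2])$|'
              r'^(19\d\d|20\d\d)$')

demo_df = pd.DataFrame({
    'DATE_OF_COLLECTION': ['2023-08-14', '2023-08', '14/08/2023'],
    'DATE_OF_PLATING': ['2023-08-01', '2023-09-01', '2023-09-01'],
})

def check_format(col, df, pattern=DATE_REGEX):
    # flag values that do not match the expected format
    bad = ~df[col].str.match(pattern)
    if bad.any():
        print(f'{col}: bad format {list(df.loc[bad, col].unique())}')

def check_order(before_col, after_col, df):
    # chronology is checked separately from format: coerce to datetime,
    # malformed or partial dates become NaT and drop out of the comparison
    before = pd.to_datetime(df[before_col], format='%Y-%m-%d', errors='coerce')
    after = pd.to_datetime(df[after_col], format='%Y-%m-%d', errors='coerce')
    conflict = before > after
    if conflict.any():
        print(f'{before_col} later than {after_col} in rows {list(df.index[conflict])}')

check_format('DATE_OF_COLLECTION', demo_df)                    # flags '14/08/2023'
check_order('DATE_OF_COLLECTION', 'DATE_OF_PLATING', demo_df)  # flags row 0

One design point worth noting: the regex accepts partial dates (YYYY-MM, YYYY) while the comparison parses full dates only, so a partial DATE_OF_COLLECTION passes the format check and is silently skipped by the chronology check - consistent with the permissive intent of these manifest fields.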
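Likewise, the manifest version inference added in this release can be illustrated with a toy frame: evidence from v3-only columns and from the PLATE_ONLY well conventions must agree before a version is returned. This is a sketch under the assumptions visible in the diff (CATCH_SOLUTION, AMOUNT_OF_CATCH_PLATED, the PLATE_ONLY* markers); guess_version and frame are illustrative names, and the notebook's infer_bioscan_version additionally logs per-column evidence before raising:

import pandas as pd

V3_ONLY_COLUMNS = ('CATCH_SOLUTION', 'AMOUNT_OF_CATCH_PLATED')
V3_PLATE_ONLY = ('PLATE_ONLY_1_BLANK', 'PLATE_ONLY_2_BLANKS')

def guess_version(frame):
    evidence = set()
    # v3-only columns: presence suggests v3, absence suggests v2
    for col in V3_ONLY_COLUMNS:
        evidence.add('v3' if col in frame.columns else 'v2')
    # well ID conventions; use .isin(...).any() - `in` on a Series tests the index
    if frame['TUBE_OR_WELL_ID'].isin(V3_PLATE_ONLY).any():
        evidence.add('v3')
    if (frame['TUBE_OR_WELL_ID'] == 'PLATE_ONLY').any():
        evidence.add('v2')
    if len(evidence) != 1:
        raise ValueError(f'conflicting version evidence: {sorted(evidence)}')
    return evidence.pop()

frame = pd.DataFrame({'TUBE_OR_WELL_ID': ['A1', 'PLATE_ONLY_1_BLANK'],
                      'CATCH_SOLUTION': ['100%_ETHANOL', ''],
                      'AMOUNT_OF_CATCH_PLATED': ['ALL_SPECIMENS_PLATED', '']})
print(guess_version(frame))  # v3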