Skip to content

Commit

Permalink
fixes upon test run on backlog, bump version, changelog
Browse files (browse the repository at this point in the history)
  • Loading branch information
amakunin committed Aug 18, 2023
1 parent 6eb7f13 commit b005d94
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 27 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Changelog

## 0.3.0

- add BIOSCAN manifest v3 support and set it as the template; v2 input is still supported
- amend STS output to match the v3 manifest, so that both v2 and v3 manifests convert into the same STS output
- regex validation for all format-sensitive fields, including dates; reimplement date comparison
- clean up dates by removing the time component added during Excel-to-pandas conversion
- validate identifier against contributors
- validate input filename
- continued error message clarifications
- logic updates, bug fixes, general refactoring
- tested on the backlog before release

## 0.2.0

Expand Down
Binary file modified data/BIOSCAN_Manifest_V3_20230818.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions work/validate_anospp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
" setup_logging(verbose=verbose)\n",
"\n",
" logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.info(f'# validating ANOSPP manifest v.{ANOSPP_VERSION}')\n",
" logging.info(f'# validating against ANOSPP manifest v.{ANOSPP_VERSION}')\n",
" logging.info(f'# manifest \"{fn}\"')\n",
"\n",
" # read data\n",
Expand Down Expand Up @@ -117,7 +117,7 @@
" if write_sts:\n",
" write_sts_manifest(df, fn, VALIDATION_VERSION)\n",
"\n",
" logging.info('# ended validate_anospp_partner_manifest_v.{}'.format(ANOSPP_VERSION))\n",
" logging.info('# ended validate_anospp_partner_manifest')\n",
" \n",
" return df\n",
"\n",
Expand Down
14 changes: 8 additions & 6 deletions work/validate_bioscan.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
" setup_logging(verbose=verbose)\n",
"\n",
" logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.info(f'# validating BIOSCAN manifest v.{BIOSCAN_VERSION}')\n",
" logging.info(f'# validating against BIOSCAN manifest v.{BIOSCAN_VERSION}')\n",
" logging.info(f'# manifest \"{fn}\"')\n",
"\n",
" # read data\n",
Expand All @@ -54,7 +54,7 @@
" # orange cols\n",
" validate_regex('CATCH_LOT', df, na_values=[])\n",
" df, gal, partner_code = validate_plates_wells(\n",
" df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True)\n",
" df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)\n",
" # check and exclude blanks\n",
" is_blank = check_blanks(df)\n",
" if df[~is_blank].shape[0] == 0:\n",
Expand Down Expand Up @@ -107,14 +107,16 @@
" validate_freetext('OTHER_INFORMATION', df)\n",
" validate_freetext('MISC_METADATA', df)\n",
" validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])\n",
" validate_input_filename(fn, partner_code, v)\n",
" \n",
" df = expand_plate_only(df)\n",
" df = add_sts_cols(df, contrib_df, gal, bioscan=True, v=v)\n",
" \n",
" # keep filename operations together\n",
" validate_input_filename(fn, partner_code, v)\n",
" if write_sts:\n",
" write_sts_manifest(df, fn, VALIDATION_VERSION)\n",
" \n",
" logging.info('# ended validation of bioscan partner manifest v.{}'.format(BIOSCAN_VERSION))\n",
" logging.info('# ended validation of bioscan partner manifest')\n",
" \n",
" print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n",
"\n",
Expand Down Expand Up @@ -161,14 +163,14 @@
"metadata": {},
"outputs": [],
"source": [
"fn = '../results/20230802_js_batch4/NBGW-2023-Manifest.xlsx'\n",
"fn = '../results/20230817_js_batch8/NBGW-2023-Manifest-2023-08-14.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ae435ce",
"id": "6a82cc40",
"metadata": {},
"outputs": [],
"source": []
Expand Down
34 changes: 15 additions & 19 deletions work/validate_partner_manifest_dev.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@
"metadata": {},
"outputs": [],
"source": [
"VALIDATION_VERSION = '0.2.0dev'\n",
"VALIDATION_VERSION = '0.3.0'\n",
"ANOSPP_VERSION = '4.0'\n",
"BIOSCAN_VERSION = '3.0'"
"# V2.0, but V3 in SOP\n",
"BIOSCAN_VERSION = '3'"
]
},
{
Expand Down Expand Up @@ -414,7 +415,7 @@
" \n",
" regexs = {\n",
" 'CATCH_LOT': (r'^C\\d{3}[A-Z]$|^NOT_APPLICABLE$', \n",
" 'like C123A'),\n",
" 'like C123A or NOT_APPLICABLE'),\n",
" 'DATE_OF_COLLECTION': (date_regex, 'in YYYY-MM-DD format'),\n",
" 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d+)?$', \n",
" 'between -90 and 90'),\n",
Expand All @@ -429,8 +430,8 @@
" 'DATE_OF_PLATING': (date_regex, 'in YYYY-MM-DD format'),\n",
" 'DATE_OF_PRESERVATION': (date_regex, 'in YYYY-MM-DD format'),\n",
" 'ELEVATION': (numeric_regex, 'only a number (in metres)'),\n",
" 'DNA_EXTRACT_VOLUME_PROVIDED': (numeric_regex, 'a number (in microlitres)'),\n",
" 'DNA_EXTRACT_CONCENTRATION': (numeric_regex, 'a number (in nanograms per microlitre)')\n",
" 'DNA_EXTRACT_VOLUME_PROVIDED': (numeric_regex, 'only a number (in microlitres)'),\n",
" 'DNA_EXTRACT_CONCENTRATION': (numeric_regex, 'only a number (in nanograms per microlitre)')\n",
" }\n",
" \n",
" is_valid_regex = series.str.match(regexs[col][0])\n",
Expand Down Expand Up @@ -477,11 +478,12 @@
" possible_partner_codes = plate_prefixes.value_counts().index.to_list()\n",
" if len(possible_partner_codes) > 1:\n",
" logging.error(f'only one plate ID prefix expected, found multiple: {possible_partner_codes}')\n",
" selected_partner_code = possible_partner_codes[-1]\n",
" # selecting most frequent prefix as partner code\n",
" selected_partner_code = possible_partner_codes[0]\n",
" selected_partner_df = partners_df.query(f'partner_code == \"{selected_partner_code}\"')\n",
" if selected_partner_df.shape[0] == 0:\n",
" logging.error(f'partner code {selected_partner_code} not found in {bioscan_partners_fn}, '\n",
" f'using \"Sanger Institute\" as default partner')\n",
" logging.error(f'plate name prefix {selected_partner_code} not found in {bioscan_partners_fn} '\n",
" f'as partner code, using \"Sanger Institute\" as default partner')\n",
" gal = \"Sanger Institute\"\n",
" else:\n",
" gal = selected_partner_df['gal'].iloc[0]\n",
Expand Down Expand Up @@ -552,7 +554,7 @@
" expected_wells = [r + str(c) for (c,r) in itertools.product(col_id, row_id)]\n",
" \n",
" blank_wells = {\n",
" 'PLATE_ONLY':['G12','H12'], # v2\n",
" 'PLATE_ONLY':['H12'], # v2\n",
" 'PLATE_ONLY_1_BLANK':['H12'], # v3\n",
" 'PLATE_ONLY_2_BLANKS':['G12','H12'], # v3\n",
" }\n",
Expand Down Expand Up @@ -709,8 +711,8 @@
" asterisk_to_strip = (series.str.startswith(\"*\") & series.str.endswith(\"*\"))\n",
" if asterisk_to_strip.any():\n",
" asterisk_series = index_ranges(series[asterisk_to_strip])\n",
" logging.warning(f'{col} column, SERIES {asterisk_series}: '\n",
" f'stripping asterisks from {series[asterisk_to_strip].unique()}')\n",
" logging.info(f'{col} column, SERIES {asterisk_series}: '\n",
" f'stripping asterisks from {series[asterisk_to_strip].unique()} for output')\n",
" series = series.str.strip('*')\n",
" df[col] = series\n",
" return df"
Expand Down Expand Up @@ -1189,7 +1191,8 @@
" df['REGULATORY_COMPLIANCE'] = 'Y'\n",
" df['HAZARD_GROUP'] = 'HG1'\n",
" if bioscan and v == 'v2':\n",
" logging.info('auto-filling CATCH_SOLUTION and AMOUNT_OF_CATCH_PLATED columns for bioscan manifest v2')\n",
" logging.info('auto-filling CATCH_SOLUTION as 100%_ETHANOL and AMOUNT_OF_CATCH_PLATED as '\n",
" 'ALL_SPECIMENS_PLATED columns for bioscan manifest v2')\n",
" df['CATCH_SOLUTION'] = '100%_ETHANOL'\n",
" df['AMOUNT_OF_CATCH_PLATED'] = 'ALL_SPECIMENS_PLATED'\n",
" if (df['IDENTIFIER_AFFILIATION'] != '').any():\n",
Expand All @@ -1215,13 +1218,6 @@
"# add_sts_cols(df, contrib_df, gal='Sanger Institute');"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down

0 comments on commit b005d94

Please sign in to comment.