Skip to content

Commit

Permalink
Merge pull request #6 from amakunin/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
amakunin authored Aug 18, 2023
2 parents 888fd2c + b005d94 commit 0326f77
Show file tree
Hide file tree
Showing 7 changed files with 340 additions and 281 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Changelog

## 0.3.0

- add bioscan manifest v3 support and set it as template, v2 input still supported
- amend STS output to match v3 manifest, so that both v2 and v3 manifests convert into the same STS output
- regex validations for all format-sensitive fields including dates, reimplement date comparison
- clean up dates by removing the time component added by Excel-to-pandas conversion
- validate identifier against contributors
- validate input filename
- continued error message clarifications
- logic updates, bug fixes, general refactoring
- tested on backlog pre-release

## 0.2.0

Expand Down
Binary file added data/BIOSCAN_Manifest_SOP_V3_20230818.docx
Binary file not shown.
Binary file modified data/BIOSCAN_Manifest_V2.0_20230727.xlsx
Binary file not shown.
Binary file added data/BIOSCAN_Manifest_V3_20230818.xlsx
Binary file not shown.
50 changes: 29 additions & 21 deletions work/validate_anospp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,21 @@
"source": [
"def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx', \n",
" verbose=False, samples_sheet='TAB 2 Metadata Entry',\n",
" contrib_sheet='TAB 1 Contributors'):\n",
" contrib_sheet='TAB 1 Contributors', write_sts=True):\n",
" '''\n",
" ANOSPP partner manifest validation\n",
" Validation follows the order of columns in the data entry sheet\n",
" '''\n",
"\n",
" setup_logging(verbose=verbose)\n",
"\n",
" logging.warning(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.warning(f'# validating ANOSPP manifest v.{ANOSPP_VERSION}')\n",
" logging.warning(f'# manifest {fn}')\n",
" logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.info(f'# validating against ANOSPP manifest v.{ANOSPP_VERSION}')\n",
" logging.info(f'# manifest \"{fn}\"')\n",
"\n",
" # read data\n",
" df = get_data(fn, sheet=samples_sheet)\n",
" df = fix_date_formats(df)\n",
" # validate series, exclude non-numeric\n",
" df = validate_series(df)\n",
" # clean up data\n",
Expand Down Expand Up @@ -69,9 +70,9 @@
" validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n",
" validate_values('ORGANISM_PART', df, valid_dict, sep='|')\n",
" # columns below validated for non-blank samples only\n",
" date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_float('DECIMAL_LATITUDE', df[~is_blank])\n",
" validate_float('DECIMAL_LONGITUDE', df[~is_blank])\n",
" validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_regex('DECIMAL_LATITUDE', df[~is_blank])\n",
" validate_regex('DECIMAL_LONGITUDE', df[~is_blank])\n",
" # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE\n",
" validate_country_and_coordinates(df[~is_blank], fn, na_values=[''])\n",
" # COLLECTION_LOCATION not checked\n",
Expand All @@ -84,8 +85,8 @@
" validate_freetext('IDENTIFIED_HOW', df[~is_blank])\n",
" validate_values('LIFESTAGE', df[~is_blank], valid_dict)\n",
" validate_values('SEX', df[~is_blank], valid_dict, na_values = [''])\n",
" validate_time('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)\n",
" validate_values('OUTDOORS_INDOORS', df[~is_blank], valid_dict, na_values = [''])\n",
" validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df[~is_blank])\n",
Expand All @@ -95,22 +96,29 @@
" validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])\n",
" validate_values('GRAVIDITY', df, valid_dict, na_values=[''])\n",
" validate_freetext('HABITAT', df)\n",
" date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION\n",
" compare_dates(before=date_coll, after=date_pres)\n",
" validate_float('ELEVATION', df, na_values=[''])\n",
" validate_wtw('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n",
" validate_regex('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION\n",
" compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PRESERVATION', df[~is_blank])\n",
" validate_regex('ELEVATION', df, na_values=[''])\n",
" validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n",
" validate_freetext('OTHER_ORGANISMS', df)\n",
" validate_freetext('BIOASSAYS', df)\n",
" validate_freetext('COLLECTOR_SAMPLE_ID', df)\n",
" validate_freetext('OTHER_INFORMATION', df)\n",
" validate_freetext('MISC_METADATA', df)\n",
" # MISC_METADATA can be removed safely\n",
" if 'MISC_METADATA' in df.columns:\n",
" validate_freetext('MISC_METADATA', df)\n",
" validate_freetext('DNA_EXTRACTION_DESCRIPTION', df)\n",
" validate_float('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])\n",
" validate_float('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])\n",
" validate_regex('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])\n",
" validate_regex('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])\n",
" \n",
" logging.warning('# ended validate_anospp_partner_manifest_v.{}'.format(ANOSPP_VERSION))\n",
" \n",
" \n",
" df = add_sts_cols(df, contrib_df, gal, bioscan=False, v='NA')\n",
" if write_sts:\n",
" write_sts_manifest(df, fn, VALIDATION_VERSION)\n",
"\n",
" # TODO yield table ready for STS submission\n",
" logging.info('# ended validate_anospp_partner_manifest')\n",
" \n",
" return df\n",
"\n",
"fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n",
Expand Down Expand Up @@ -152,14 +160,14 @@
"metadata": {},
"outputs": [],
"source": [
"fn = '../results/20230725_aydi_akje_re/Manifest-MADAGASCAR 2023 II.xlsx'\n",
"df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributor')"
"fn = '../results/20230814_olaitan_2_re_re/Anopheles_Metadata_Manifest_V4.0_EKUW - 14.08.2023 - antivec samples.xlsx'\n",
"df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ff01236",
"id": "520423bd",
"metadata": {},
"outputs": [],
"source": []
Expand Down
79 changes: 51 additions & 28 deletions work/validate_bioscan.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n",
"def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
" samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n",
" verbose=False, write_sts=True):\n",
" '''\n",
Expand All @@ -27,12 +27,15 @@
"\n",
" setup_logging(verbose=verbose)\n",
"\n",
" logging.warning(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.warning(f'# validating BIOSCAN manifest v.{BIOSCAN_VERSION}')\n",
" logging.warning(f'# manifest {fn}')\n",
" logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')\n",
" logging.info(f'# validating against BIOSCAN manifest v.{BIOSCAN_VERSION}')\n",
" logging.info(f'# manifest \"{fn}\"')\n",
"\n",
" # read data\n",
" df = get_data(fn, sheet=samples_sheet)\n",
" df = fix_date_formats(df)\n",
" # check manifest version\n",
" v = infer_bioscan_version(df)\n",
" # check series, exclude non-numeric\n",
" df = validate_series(df)\n",
" # clean up data\n",
Expand All @@ -44,36 +47,43 @@
" \n",
" # prepare for validation\n",
" template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')\n",
" check_columns(df, template_df)\n",
" check_columns(df, template_df, bioscan_version=v)\n",
" valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')\n",
" contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)\n",
"\n",
" # orange cols\n",
" validate_catch_lot('CATCH_LOT', df, na_values=[''])\n",
" df, gal = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True)\n",
" validate_regex('CATCH_LOT', df, na_values=[])\n",
" df, gal, partner_code = validate_plates_wells(\n",
" df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)\n",
" # check and exclude blanks\n",
" is_blank = check_blanks(df)\n",
" if df[~is_blank].shape[0] == 0:\n",
" logging.error('no non-blank samples to validate, terminating')\n",
" return df\n",
" validate_values('ORGANISM_PART', df, valid_dict, sep='|')\n",
" df = strip_asterisks('ORGANISM_PART', df)\n",
" validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n",
" df = strip_asterisks('PRESERVATIVE_SOLUTION', df)\n",
" # columns below validated for non-blank samples only\n",
" if v == 'v3':\n",
" validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)\n",
" df = strip_asterisks('CATCH_SOLUTION', df)\n",
" validate_values('BOTTLE_DIRECTION', df[~is_blank], valid_dict) # TODO allow for blank in non-Malaise trap samples\n",
" date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_float('DECIMAL_LATITUDE', df[~is_blank])\n",
" validate_float('DECIMAL_LONGITUDE', df[~is_blank])\n",
" validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n",
" validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])\n",
" validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])\n",
" # COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE\n",
" validate_country_and_coordinates(df[~is_blank], fn, na_values=[''], bioscan=True)\n",
" # COLLECTION_LOCATION not checked\n",
" \n",
" # purple cols - validated for non-blank samples\n",
" validate_wtw('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n",
" validate_time('TIME_OF_COLLECTION', df[~is_blank], bioscan=True)\n",
" validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n",
" validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)\n",
" date_plat = validate_date('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" compare_dates(before=date_coll, after=date_plat)\n",
" validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n",
" validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])\n",
" df = strip_asterisks('COLLECTION_METHOD', df)\n",
" validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
" compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])\n",
" # taxonomy validation adds taxid columns to original dataframe - skipping for now\n",
" df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])\n",
" validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])\n",
Expand All @@ -83,7 +93,8 @@
" validate_values('SORTING_SOLUTION_USED', df[~is_blank], valid_dict, na_values=[''])\n",
" validate_values('CATCH_BOTTLE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n",
" validate_values('PLATE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n",
" \n",
" if v == 'v3':\n",
" validate_values('AMOUNT_OF_CATCH_PLATED', df[~is_blank], valid_dict)\n",
" # white cols - validated for all samples\n",
" validate_freetext('MORPHOSPECIES_DESCRIPTION', df)\n",
" validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df)\n",
Expand All @@ -92,34 +103,38 @@
" # TODO check if STS will need something here\n",
" validate_freetext('COLLECTOR_SAMPLE_ID', df)\n",
" validate_freetext('VOUCHER_ID', df)\n",
" validate_float('ELEVATION', df, na_values=[''])\n",
" validate_regex('ELEVATION', df, na_values=[''])\n",
" validate_freetext('OTHER_INFORMATION', df)\n",
" validate_freetext('MISC_METADATA', df)\n",
" # IDENTIFIED_BY not checked\n",
" # IDENTIFIER_AFFILIATION not checked\n",
" validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])\n",
" \n",
" df = expand_plate_only(df)\n",
" df = add_sts_cols(df, contrib_df, gal)\n",
" df = add_sts_cols(df, contrib_df, gal, bioscan=True, v=v)\n",
" \n",
" # keep filename operations together\n",
" validate_input_filename(fn, partner_code, v)\n",
" if write_sts:\n",
" write_sts_manifest(df, fn, VALIDATION_VERSION)\n",
" \n",
" logging.warning('# ended validate_bioscan_partner_manifest_v.{}'.format(BIOSCAN_VERSION))\n",
" logging.info('# ended validation of bioscan partner manifest')\n",
" \n",
" print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n",
"\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "230f6339",
"id": "c9b0c5c4",
"metadata": {},
"outputs": [],
"source": [
"# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'\n",
"df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n",
" template_fn='../data/BIOSCAN_Manifest_V2.0_20230727.xlsx', \n",
"df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
" template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
" verbose=False,\n",
" samples_sheet='TAB 5 DO NOT EDIT - TEST Met',\n",
" samples_sheet='TAB 6 TEST',\n",
" write_sts=True)"
]
},
Expand All @@ -133,21 +148,29 @@
"raise Exception('Setup complete')"
]
},
{
"cell_type": "markdown",
"id": "a030e1bb",
"metadata": {},
"source": [
"## Validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b170f12",
"metadata": {},
"outputs": [],
"source": [
"fn = '../results/20230712_js_batch1/[NENM]_[2307]_BIOSCAN_Manifest_V2.0.xlsx'\n",
"fn = '../results/20230817_js_batch8/NBGW-2023-Manifest-2023-08-14.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43a0cf51",
"id": "6a82cc40",
"metadata": {},
"outputs": [],
"source": []
Expand Down
Loading

0 comments on commit 0326f77

Please sign in to comment.