Skip to content

Commit

Permalink
Merge pull request #7 from amakunin/dev
Browse files Browse the repository at this point in the history
v0.3.1
  • Loading branch information
amakunin authored Mar 4, 2024
2 parents 0326f77 + f32fd59 commit 49bb41a
Show file tree
Hide file tree
Showing 5 changed files with 364 additions and 66 deletions.
Binary file not shown.
18 changes: 17 additions & 1 deletion data/bioscan_partners.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
partner_code gal
CAMP Sanger Institute
OXWW UNIVERSITY OF OXFORD
YARN NATURAL ENGLAND (EAST DARTMOOR NNR)
SHAP NATURAL ENGLAND (SHAPWICK HEATH NNR)
Expand Down Expand Up @@ -39,3 +38,20 @@ CARM SUFFOLK WILDLIFE TRUST (CARLTON MARSHES)
LFLA SUFFOLK WILDLIFE TRUST (LACKFORD LAKES)
RAIM RSPB (RAINHAM MARSHES)
NIDD UNIVERSITY OF LEEDS
ZSEA ZOOLOGICAL SOCIETY OF EAST ANGLIA
POMS UK CENTRE FOR ECOLOGY AND HYDROLOGY
SWLT SUSSEX WILDLIFE TRUST
NNPA NORTHUMBERLAND NATIONAL PARK
HEPP HEPPLE ESTATE
NWWT NORTH WALES WILDLIFE TRUST
HLNR HOLY LOCH NATURE RESERVE
BGPT UNIVERSIDADE DO PORTO
BGKU V. N. KARAZIN KHARKIV NATIONAL UNIVERSITY
BGEG ARISTOTLE UNIVERSITY OF THESSALONIKI
CAMP WELLCOME SANGER INSTITUTE
CEHA UK CENTRE FOR ECOLOGY AND HYDROLOGY (AUCHENCORTH)
YWTP YORKSHIRE WILDLIFE TRUST (POTTERIC CARR)
CZRC CHESTER ZOO RECORD CHESTER
CEHG UK CENTRE FOR ECOLOGY AND HYDROLOGY (GAIT BARROWS)
UPBM UNIVERSITY OF PLYMOUTH
LJMU LIVERPOOL JOHN MOORES UNIVERSITY
8 changes: 5 additions & 3 deletions work/validate_anospp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@
" \n",
" # orange cols\n",
" # RACK_OR_PLATE_ID, TUBE_OR_WELL_ID\n",
" df, gal = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')\n",
" df, gal, partner_code = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')\n",
" # check blanks\n",
" is_blank = check_blanks(df)\n",
" df, is_blank = check_blanks(df)\n",
" if df[~is_blank].shape[0] == 0:\n",
" logging.error('no non-blank samples to validate, terminating')\n",
" return df\n",
Expand Down Expand Up @@ -119,6 +119,8 @@
"\n",
" logging.info('# ended validate_anospp_partner_manifest')\n",
" \n",
" print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n",
" \n",
" return df\n",
"\n",
"fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n",
Expand Down Expand Up @@ -160,7 +162,7 @@
"metadata": {},
"outputs": [],
"source": [
"fn = '../results/20230814_olaitan_2_re_re/Anopheles_Metadata_Manifest_V4.0_EKUW - 14.08.2023 - antivec samples.xlsx'\n",
"fn = '../results/20231019_a_poal_adad/Anopheles_Metadata_Manifest_V4.0_POAL_101623_am60.xlsx'\n",
"df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors')"
]
},
Expand Down
67 changes: 51 additions & 16 deletions work/validate_bioscan.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
"metadata": {},
"outputs": [],
"source": [
"def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
"def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx', \n",
" samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n",
" verbose=False, write_sts=True):\n",
" verbose=False, write_sts=True, bold_input=False):\n",
" '''\n",
" BIOSCAN partner manifest validation\n",
" Validation follows the order of columns in the data entry sheet\n",
Expand All @@ -32,7 +32,11 @@
" logging.info(f'# manifest \"{fn}\"')\n",
"\n",
" # read data\n",
" df = get_data(fn, sheet=samples_sheet)\n",
" template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')\n",
" if bold_input:\n",
" df = parse_bold(fn, template_df, sheet='BGE entry')\n",
" else:\n",
" df = get_data(fn, sheet=samples_sheet)\n",
" df = fix_date_formats(df)\n",
" # check manifest version\n",
" v = infer_bioscan_version(df)\n",
Expand All @@ -46,30 +50,44 @@
" ncbi = ete3.NCBITaxa()\n",
" \n",
" # prepare for validation\n",
" template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')\n",
" check_columns(df, template_df, bioscan_version=v)\n",
" valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')\n",
" contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)\n",
" # update validation dictionary from v3 to v2\n",
" if v == 'v2':\n",
" valid_dict['SORTING_SOLUTION_USED'] += ['N']\n",
" if bold_input:\n",
" contrib_df = validate_contributors(template_fn, contrib_sheet=contrib_sheet)\n",
" else:\n",
" contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)\n",
"\n",
" # orange cols\n",
" validate_regex('CATCH_LOT', df, na_values=[])\n",
" \n",
" df, gal, partner_code = validate_plates_wells(\n",
" df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)\n",
" # check and exclude blanks\n",
" is_blank = check_blanks(df)\n",
" df, is_blank = check_blanks(df, bioscan=True)\n",
" if df[~is_blank].shape[0] == 0:\n",
" logging.error('no non-blank samples to validate, terminating')\n",
" return df\n",
" # hope to get WING into STS ORGANISM_PART values later, fixing on our side for now\n",
" if df['ORGANISM_PART'].str.contains('WING').any():\n",
" logging.warning('replacing ORGANISM_PART \"WING\" entries with “**OTHER_SOMATIC_ANIMAL_TISSUE**”')\n",
" df['ORGANISM_PART'] = df['ORGANISM_PART'] \\\n",
" .str.replace('WING', '**OTHER_SOMATIC_ANIMAL_TISSUE**', regex=False)\n",
" validate_values('ORGANISM_PART', df, valid_dict, sep='|')\n",
" df = strip_asterisks('ORGANISM_PART', df)\n",
"# df = strip_asterisks('ORGANISM_PART', df)\n",
" validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n",
" df = strip_asterisks('PRESERVATIVE_SOLUTION', df)\n",
"# df = strip_asterisks('PRESERVATIVE_SOLUTION', df)\n",
" # columns below validated for non-blank samples only\n",
" if v == 'v3':\n",
" validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)\n",
" df = strip_asterisks('CATCH_SOLUTION', df)\n",
" validate_values('BOTTLE_DIRECTION', df[~is_blank], valid_dict) # TODO allow for blank in non-Malaise trap samples\n",
" validate_values('BOTTLE_DIRECTION', \n",
" df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples\n",
" valid_dict)\n",
" validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n",
" check_catch_lot_dates(df[~is_blank])\n",
" validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])\n",
" validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])\n",
" # COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE\n",
Expand Down Expand Up @@ -110,6 +128,10 @@
" \n",
" df = expand_plate_only(df)\n",
" df = add_sts_cols(df, contrib_df, gal, bioscan=True, v=v)\n",
" if bold_input:\n",
" logging.info('Replacing CONTRIBUTORS with IDENTIFIED_BY for BOLD manifest')\n",
" df['CONTRIBUTORS'] = df['IDENTIFIED_BY']\n",
" \n",
" \n",
" # keep filename operations together\n",
" validate_input_filename(fn, partner_code, v)\n",
Expand All @@ -130,9 +152,8 @@
"metadata": {},
"outputs": [],
"source": [
"# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'\n",
"df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
" template_fn='../data/BIOSCAN_Manifest_V3_20230818.xlsx', \n",
"df = validate_bioscan(fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx', \n",
" template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx', \n",
" verbose=False,\n",
" samples_sheet='TAB 6 TEST',\n",
" write_sts=True)"
Expand All @@ -159,18 +180,32 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6b170f12",
"metadata": {},
"id": "6a82cc40",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"fn = '../results/20230817_js_batch8/NBGW-2023-Manifest-2023-08-14.xlsx'\n",
"fn = '../results/20240304_shap_test/SHAP_2401_BIOSCAN_Manifest_V3.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a82cc40",
"id": "0304ddd6",
"metadata": {},
"outputs": [],
"source": [
"fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Yulia_Guglia_Diptera_12_plates.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n",
" verbose=False, bold_input=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40a9e9aa",
"metadata": {},
"outputs": [],
"source": []
Expand Down
Loading

0 comments on commit 49bb41a

Please sign in to comment.