Skip to content

Commit

Permalink
Merge pull request #5 from AI-sandbox/refactor-and-document
Browse files Browse the repository at this point in the history
Fix SNP writers related to pathlib. Refactor and document Ancestry Objects, SNPObject, and dim reduction classes
  • Loading branch information
salcc authored Nov 15, 2024
2 parents bb6fc3a + c69d673 commit 1c4bfe4
Show file tree
Hide file tree
Showing 12 changed files with 920 additions and 959 deletions.
155 changes: 104 additions & 51 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 1,
"id": "674d136a-1ded-4ce0-babc-87556d518b5f",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -248,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "dcf2c069",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -281,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "7c64f7ca",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "30807670",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -346,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "9beadb42",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -384,7 +384,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "4cf4d34e",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -416,21 +416,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "89ae6d84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering by indexes: 976599\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering by indexes: 976599\n",
"Number of SNPs after filtering by indexes: 3\n"
]
}
Expand Down Expand Up @@ -458,15 +452,21 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "4fa41771",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples after filtering: ['sample_A', 'sample_B']\n"
]
}
Expand All @@ -490,21 +490,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"id": "ba4d2066",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples after filtering: ['sample_B', 'sample_C']\n"
]
}
Expand Down Expand Up @@ -657,7 +651,7 @@
}
],
"source": [
"snpobj_corrected = snpobj.correct_snp_variants(snpobj2, check_complement=True, index_by='pos', inplace=False)\n",
"snpobj_corrected = snpobj.correct_flipped_variants(snpobj2, check_complement=True, index_by='pos', inplace=False)\n",
"\n",
"print(\"SNP flips corrected.\")"
]
Expand Down Expand Up @@ -728,7 +722,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions after shuffling: [14258595 13978490 23721180 30763191 42617482]\n"
"First 5 variant positions after shuffling: [23614817 40647956 44287560 41652214 42829912]\n"
]
}
],
Expand Down Expand Up @@ -760,13 +754,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n",
"Variants_ref after handling empty entries: ['.' 'G' 'G' 'C' 'A']\n"
]
}
Expand Down Expand Up @@ -802,26 +790,74 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 29,
"id": "544b2955",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNPObject saved to ../data/output.vcf\n"
"SNPObject saved to ../data/output.vcf\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the VCF file\n",
"output_vcf_path = '../data/output.vcf'\n",
"output_vcf_path1 = '../data/output.vcf'\n",
"output_vcf_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a VCF file\n",
"snpobj.save(output_vcf_path)\n",
"# Save the SNPObject as a VCF file (option 1)\n",
"snpobj.save(output_vcf_path1)\n",
"print(f\"SNPObject saved to {output_vcf_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_vcf_path}\")"
"# Save the SNPObject as a VCF file (option 2)\n",
"snpobj.save_vcf(output_vcf_path2)\n",
"print(f\"SNPObject saved to {output_vcf_path2}\")"
]
},
{
"cell_type": "markdown",
"id": "a85bcfcf",
"metadata": {},
"source": [
"**Saving as PGEN**"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.unphased.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pgen\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the PGEN file\n",
"output_pgen_path1 = '../data/output.pgen'\n",
"output_pgen_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a PGEN file (option 1)\n",
"snpobj.save(output_pgen_path1)\n",
"print(f\"SNPObject saved to {output_pgen_path1}\")\n",
"\n",
"# Save the SNPObject as a PGEN file (option 2)\n",
"snpobj.save_pgen(output_pgen_path2)\n",
"print(f\"SNPObject saved to {output_pgen_path2}\")"
]
},
{
Expand All @@ -834,7 +870,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 31,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand All @@ -849,18 +885,30 @@
"INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output\n",
"WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output\n",
"SNPObject saved to ../data/output.bed\n"
"SNPObject saved to ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Writing .bed file: ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bed file: ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Writing .fam file: ../data/output\n",
"INFO:snputils.snp.io.write.bed:Finished writing .fam file: ../data/output\n",
"INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output\n",
"WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the BED file\n",
"output_bed_path = '../data/output.bed'\n",
"output_bed_path1 = '../data/output.bed'\n",
"output_bed_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a BED file\n",
"snpobj.save(output_bed_path)\n",
"# Save the SNPObject as a BED file (option 1)\n",
"snpobj.save(output_bed_path1)\n",
"print(f\"SNPObject saved to {output_bed_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_bed_path}\")"
"# Save the SNPObject as a BED file (option 2)\n",
"snpobj.save_bed(output_bed_path2)\n",
"print(f\"SNPObject saved to {output_bed_path2}\")"
]
},
{
Expand All @@ -873,26 +921,31 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 33,
"id": "cea856ad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNPObject saved to ../data/output.pkl\n"
"SNPObject saved to ../data/output.pkl\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the pickle file\n",
"output_pkl_path = '../data/output.pkl'\n",
"output_pkl_path1 = '../data/output.pkl'\n",
"output_pkl_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a pickle file\n",
"snpobj.save(output_pkl_path)\n",
"# Save the SNPObject as a pickle file (option 1)\n",
"snpobj.save(output_pkl_path1)\n",
"print(f\"SNPObject saved to {output_pkl_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_pkl_path}\")"
"# Save the SNPObject as a pickle file (option 2)\n",
"snpobj.save_pickle(output_pkl_path2)\n",
"print(f\"SNPObject saved to {output_pkl_path2}\")"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions demos/SNP_PCA.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions demos/TorchPCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"id": "2fd07cb3",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"id": "abc98c7c",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"id": "fa643a80",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -137,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"id": "dfd07a81",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -188,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 11,
"id": "0e1a6754",
"metadata": {},
"outputs": [
Expand All @@ -197,7 +197,7 @@
"output_type": "stream",
"text": [
"Using device: cpu\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.108 seconds\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.078 seconds\n",
"PCA result shape: torch.Size([4, 2])\n"
]
}
Expand Down Expand Up @@ -233,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 12,
"id": "65b4a232",
"metadata": {
"scrolled": true
Expand Down
Loading

0 comments on commit 1c4bfe4

Please sign in to comment.