Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix SNP writers related to pathlib. Refactor and document Ancestry Objects, SNPObject, and dim reduction classes #5

Merged
merged 11 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 104 additions & 51 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 1,
"id": "674d136a-1ded-4ce0-babc-87556d518b5f",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -248,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "dcf2c069",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -281,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "7c64f7ca",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "30807670",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -346,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "9beadb42",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -384,7 +384,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "4cf4d34e",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -416,21 +416,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "89ae6d84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering by indexes: 976599\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering by indexes: 976599\n",
"Number of SNPs after filtering by indexes: 3\n"
]
}
Expand Down Expand Up @@ -458,15 +452,21 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "4fa41771",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples after filtering: ['sample_A', 'sample_B']\n"
]
}
Expand All @@ -490,21 +490,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"id": "ba4d2066",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples after filtering: ['sample_B', 'sample_C']\n"
]
}
Expand Down Expand Up @@ -657,7 +651,7 @@
}
],
"source": [
"snpobj_corrected = snpobj.correct_snp_variants(snpobj2, check_complement=True, index_by='pos', inplace=False)\n",
"snpobj_corrected = snpobj.correct_flipped_variants(snpobj2, check_complement=True, index_by='pos', inplace=False)\n",
"\n",
"print(\"SNP flips corrected.\")"
]
Expand Down Expand Up @@ -728,7 +722,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions after shuffling: [14258595 13978490 23721180 30763191 42617482]\n"
"First 5 variant positions after shuffling: [23614817 40647956 44287560 41652214 42829912]\n"
]
}
],
Expand Down Expand Up @@ -760,13 +754,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n",
"Variants_ref after handling empty entries: ['.' 'G' 'G' 'C' 'A']\n"
]
}
Expand Down Expand Up @@ -802,26 +790,74 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 29,
"id": "544b2955",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNPObject saved to ../data/output.vcf\n"
"SNPObject saved to ../data/output.vcf\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the VCF file\n",
"output_vcf_path = '../data/output.vcf'\n",
"output_vcf_path1 = '../data/output.vcf'\n",
"output_vcf_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a VCF file\n",
"snpobj.save(output_vcf_path)\n",
"# Save the SNPObject as a VCF file (Option 1)\n",
"snpobj.save(output_vcf_path1)\n",
"print(f\"SNPObject saved to {output_vcf_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_vcf_path}\")"
"# Save the SNPObject as a VCF file (Option 2)\n",
"snpobj.save_vcf(output_vcf_path2)\n",
"print(f\"SNPObject saved to {output_vcf_path2}\")"
]
},
{
"cell_type": "markdown",
"id": "a85bcfcf",
"metadata": {},
"source": [
"**Saving as PGEN**"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.unphased.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.unphased.pgen\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the PGEN file\n",
"output_pgen_path1 = '../data/output.pgen'\n",
"output_pgen_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a PGEN file (option 1)\n",
"snpobj.save(output_pgen_path1)\n",
"print(f\"SNPObject saved to {output_pgen_path1}\")\n",
"\n",
"# Save the SNPObject as a PGEN file (option 2)\n",
"snpobj.save_pgen(output_pgen_path2)\n",
"print(f\"SNPObject saved to {output_pgen_path2}\")"
]
},
{
Expand All @@ -834,7 +870,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 31,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand All @@ -849,18 +885,30 @@
"INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output\n",
"WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output\n",
"SNPObject saved to ../data/output.bed\n"
"SNPObject saved to ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Writing .bed file: ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bed file: ../data/output.bed\n",
"INFO:snputils.snp.io.write.bed:Writing .fam file: ../data/output\n",
"INFO:snputils.snp.io.write.bed:Finished writing .fam file: ../data/output\n",
"INFO:snputils.snp.io.write.bed:Writing .bim file: ../data/output\n",
"WARNING:snputils.snp.io.write.bed:The .bim file is being saved with 0 cM values.\n",
"INFO:snputils.snp.io.write.bed:Finished writing .bim file: ../data/output\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the BED file\n",
"output_bed_path = '../data/output.bed'\n",
"output_bed_path1 = '../data/output.bed'\n",
"output_bed_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a BED file\n",
"snpobj.save(output_bed_path)\n",
"# Save the SNPObject as a BED file (option 1)\n",
"snpobj.save(output_bed_path1)\n",
"print(f\"SNPObject saved to {output_bed_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_bed_path}\")"
"# Save the SNPObject as a BED file (option 2)\n",
"snpobj.save_bed(output_bed_path2)\n",
"print(f\"SNPObject saved to {output_bed_path2}\")"
]
},
{
Expand All @@ -873,26 +921,31 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 33,
"id": "cea856ad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNPObject saved to ../data/output.pkl\n"
"SNPObject saved to ../data/output.pkl\n",
"SNPObject saved to ../data/output.unphased\n"
]
}
],
"source": [
"# Define the path to save the pickle file\n",
"output_pkl_path = '../data/output.pkl'\n",
"output_pkl_path1 = '../data/output.pkl'\n",
"output_pkl_path2 = '../data/output.unphased'\n",
"\n",
"# Save the SNPObject as a pickle file\n",
"snpobj.save(output_pkl_path)\n",
"# Save the SNPObject as a pickle file (option 1)\n",
"snpobj.save(output_pkl_path1)\n",
"print(f\"SNPObject saved to {output_pkl_path1}\")\n",
"\n",
"print(f\"SNPObject saved to {output_pkl_path}\")"
"# Save the SNPObject as a pickle file (option 2)\n",
"snpobj.save_pickle(output_pkl_path2)\n",
"print(f\"SNPObject saved to {output_pkl_path2}\")"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions demos/SNP_PCA.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions demos/TorchPCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"id": "2fd07cb3",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"id": "abc98c7c",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"id": "fa643a80",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -137,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"id": "dfd07a81",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -188,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 11,
"id": "0e1a6754",
"metadata": {},
"outputs": [
Expand All @@ -197,7 +197,7 @@
"output_type": "stream",
"text": [
"Using device: cpu\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.108 seconds\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.078 seconds\n",
"PCA result shape: torch.Size([4, 2])\n"
]
}
Expand Down Expand Up @@ -233,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 12,
"id": "65b4a232",
"metadata": {
"scrolled": true
Expand Down
Loading