From e345f6bdf8db72b4bc8a13452af9dc1486db42a2 Mon Sep 17 00:00:00 2001 From: mariacuria Date: Tue, 26 Nov 2024 14:53:47 -0500 Subject: [PATCH 1/7] Tidy up directory --- .../convert_step2/cbioportal/{canonical.py => 2_canonical.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pipeline/convert_step2/cbioportal/{canonical.py => 2_canonical.py} (100%) diff --git a/pipeline/convert_step2/cbioportal/canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py similarity index 100% rename from pipeline/convert_step2/cbioportal/canonical.py rename to pipeline/convert_step2/cbioportal/2_canonical.py From 2da1abeaa972fccee41edc21c5d7d0ef2c1de00d Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 10:18:06 -0500 Subject: [PATCH 2/7] Compare fasta sequences initial commit --- .../convert_step2/cbioportal/2_canonical.py | 4 +- .../convert_step2/cbioportal/compare_fasta.py | 38 +++++++++++++++++++ .../cbioportal/compare_fasta_test.py | 12 ++++++ 3 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 pipeline/convert_step2/cbioportal/compare_fasta.py create mode 100644 pipeline/convert_step2/cbioportal/compare_fasta_test.py diff --git a/pipeline/convert_step2/cbioportal/2_canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py index 8fb9d95..3149272 100644 --- a/pipeline/convert_step2/cbioportal/2_canonical.py +++ b/pipeline/convert_step2/cbioportal/2_canonical.py @@ -2,7 +2,7 @@ import pandas as pd # Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json", "r") as f: +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: ensp_to_uniprot = json.load(f) # Load the isoform data CSV @@ -34,5 +34,5 @@ def strip_suffix(identifier): break # Exit inner loop once the first match is found # Write the result to a JSON file -with 
open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical.json", "w") as json_file: +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/compare_fasta.py b/pipeline/convert_step2/cbioportal/compare_fasta.py new file mode 100644 index 0000000..a59ecb0 --- /dev/null +++ b/pipeline/convert_step2/cbioportal/compare_fasta.py @@ -0,0 +1,38 @@ +import requests +import json + +# Load your JSON file containing ENSEMBL to UniProt mappings +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json") as file: + ensembl_uniprot_map = json.load(file) + +def fetch_ensembl_sequence(ensembl_id): #it works + url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain" + #https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/x-fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.strip() + else: + print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}") + return None + +def fetch_uniprot_sequence(uniprot_id): #it works + url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.split('\n', 1)[1].replace('\n', '') + else: + print(f"Failed to fetch UniProt sequence for {uniprot_id}") + return None + +# Compare sequences +for ensembl_id, uniprot_id in ensembl_uniprot_map.items(): + ensembl_sequence = fetch_ensembl_sequence(ensembl_id) + uniprot_sequence = fetch_uniprot_sequence(uniprot_id) + + if ensembl_sequence and uniprot_sequence: + if ensembl_sequence == uniprot_sequence: + print(f"Sequences match for {ensembl_id} and {uniprot_id}") + else: + print(f"Sequences do not match for {ensembl_id} and {uniprot_id}") + else: + print(f"Could not compare sequences for {ensembl_id} and 
{uniprot_id}") diff --git a/pipeline/convert_step2/cbioportal/compare_fasta_test.py b/pipeline/convert_step2/cbioportal/compare_fasta_test.py new file mode 100644 index 0000000..237daee --- /dev/null +++ b/pipeline/convert_step2/cbioportal/compare_fasta_test.py @@ -0,0 +1,12 @@ +import requests + +def fetch_uniprot_sequence(uniprot_id): + url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.split('\n', 1)[1].replace('\n', '') + else: + print(f"Failed to fetch UniProt sequence for {uniprot_id}") + return None + +print(fetch_uniprot_sequence('Q9NXB0')) \ No newline at end of file From 56b0686bf8f5b7f45c1481af37d3ae998da98b15 Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 11:24:49 -0500 Subject: [PATCH 3/7] Write whether or not a given ENSP is canonical --- .../cbioportal/2_canonical_yes_no.py | 46 +++++++++++++++++++ .../{compare_fasta.py => 3_compare_fasta.py} | 0 2 files changed, 46 insertions(+) create mode 100644 pipeline/convert_step2/cbioportal/2_canonical_yes_no.py rename pipeline/convert_step2/cbioportal/{compare_fasta.py => 3_compare_fasta.py} (100%) diff --git a/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py new file mode 100644 index 0000000..492ba1a --- /dev/null +++ b/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py @@ -0,0 +1,46 @@ +import json +import pandas as pd + +# Load the ENSP to UniProt mapping JSON +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: + ensp_to_uniprot = json.load(f) + +# Load the isoform data CSV +isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") + +# Prepare a dictionary to store the results +result = {} + +# Function to strip suffixes (anything after a hyphen) from both isoform IDs and UniProtKB Canonical 
ACs +def strip_suffix(identifier): + if isinstance(identifier, str) and '-' in identifier: + return identifier.split('-')[0] # Strip everything after the first hyphen + return identifier + +# Iterate over each ENSP and its corresponding UniProt ID +for ensp, uniprot in ensp_to_uniprot.items(): + # Default to "no" for canonical until proven otherwise + is_canonical = "no" + + # Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms + for _, entry in isoform_data.iterrows(): + # Strip suffixes from isoform IDs before comparison + reviewed_isoforms = strip_suffix(entry.get("reviewed_isoforms", "")) + unreviewed_isoforms = strip_suffix(entry.get("unreviewed_isoforms", "")) + + # Check if the UniProt ID matches any isoform (stripped version) + if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms: + # If a match is found, add the uniprotkb_canonical_ac to the result + uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac")) + + # Check if the UniProt ID matches the canonical version + if uniprot == uniprotkb_ac: + is_canonical = "yes" + + # Store the result with canonical status + result[ensp] = {"uniprotkb_canonical_ac": uniprotkb_ac, "canonical": is_canonical} + break # Exit inner loop once the first match is found + +# Write the result to a JSON file +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: + json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/compare_fasta.py b/pipeline/convert_step2/cbioportal/3_compare_fasta.py similarity index 100% rename from pipeline/convert_step2/cbioportal/compare_fasta.py rename to pipeline/convert_step2/cbioportal/3_compare_fasta.py From a077d36187f56dec1de9ad357b14d102d95c0aea Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 14:15:54 -0500 Subject: [PATCH 4/7] Moved scripts to proper directories --- .../cbioportal/1_generate_cancer_do_json.py} | 0 
.../convert_step2/cbioportal/2_canonical.py | 38 ------ ...ot_batch.sh => 2_ensp_to_uniprot_batch.sh} | 0 ...onical_yes_no.py => 3_canonical_yes_no.py} | 0 ...{3_compare_fasta.py => 4_compare_fasta.py} | 0 .../liftover/3_get_ensp_ver2_toy.py | 112 ++++++++++++++++++ .../cbioportal/find_incomplete.py | 2 +- .../call_config_with_py.txt | 0 .../cbioportal => helper_scripts}/cleanup.sh | 0 .../combined_mapping.py | 0 .../extract_uniq_do_names.py | 2 +- .../find_duplicates.sh | 0 .../instructions.md | 0 .../instructions.py | 0 14 files changed, 114 insertions(+), 40 deletions(-) rename pipeline/{download_step1/cbioportal/generate_cancer_do_json.py => convert_step2/cbioportal/1_generate_cancer_do_json.py} (100%) delete mode 100644 pipeline/convert_step2/cbioportal/2_canonical.py rename pipeline/convert_step2/cbioportal/{1_ensp_to_uniprot_batch.sh => 2_ensp_to_uniprot_batch.sh} (100%) rename pipeline/convert_step2/cbioportal/{2_canonical_yes_no.py => 3_canonical_yes_no.py} (100%) rename pipeline/convert_step2/cbioportal/{3_compare_fasta.py => 4_compare_fasta.py} (100%) create mode 100644 pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py rename pipeline/{download_step1/cbioportal => helper_scripts}/call_config_with_py.txt (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/cleanup.sh (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/combined_mapping.py (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/extract_uniq_do_names.py (85%) rename pipeline/{download_step1/cbioportal => helper_scripts}/find_duplicates.sh (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/instructions.md (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/instructions.py (100%) diff --git a/pipeline/download_step1/cbioportal/generate_cancer_do_json.py b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py similarity index 100% rename from 
pipeline/download_step1/cbioportal/generate_cancer_do_json.py rename to pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py diff --git a/pipeline/convert_step2/cbioportal/2_canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py deleted file mode 100644 index 3149272..0000000 --- a/pipeline/convert_step2/cbioportal/2_canonical.py +++ /dev/null @@ -1,38 +0,0 @@ -import json -import pandas as pd - -# Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: - ensp_to_uniprot = json.load(f) - -# Load the isoform data CSV -isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") - -# Prepare a dictionary to store the results -result = {} - -# Function to strip suffixes (anything after a hyphen) from both isoform IDs and UniProtKB Canonical ACs -def strip_suffix(identifier): - if isinstance(identifier, str) and '-' in identifier: - return identifier.split('-')[0] # Strip everything after the first hyphen - return identifier - -# Iterate over each ENSP and its corresponding UniProt ID -for ensp, uniprot in ensp_to_uniprot.items(): - # Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms - for _, entry in isoform_data.iterrows(): - # Strip suffixes from isoform IDs before comparison - reviewed_isoforms = strip_suffix(entry.get("reviewed_isoforms", "")) - unreviewed_isoforms = strip_suffix(entry.get("unreviewed_isoforms", "")) - - # Check if the UniProt ID matches any isoform (stripped version) - if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms: - # If a match is found, add the uniprotkb_canonical_ac to the result - uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac")) - # Store the first match found for each ENSP identifier - result[ensp] = uniprotkb_ac - break # Exit inner loop once the first match is found - -# Write the result to a 
JSON file -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: - json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/1_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh similarity index 100% rename from pipeline/convert_step2/cbioportal/1_ensp_to_uniprot_batch.sh rename to pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh diff --git a/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py similarity index 100% rename from pipeline/convert_step2/cbioportal/2_canonical_yes_no.py rename to pipeline/convert_step2/cbioportal/3_canonical_yes_no.py diff --git a/pipeline/convert_step2/cbioportal/3_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py similarity index 100% rename from pipeline/convert_step2/cbioportal/3_compare_fasta.py rename to pipeline/convert_step2/cbioportal/4_compare_fasta.py diff --git a/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py b/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py new file mode 100644 index 0000000..8d505c1 --- /dev/null +++ b/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py @@ -0,0 +1,112 @@ +import csv +import pickle +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent)) +from utils import ROOT_DIR +from utils.config import get_config + +# Load input_bed and ann +# Set input directory for JSON files and output file for BED format +config_obj = get_config() +wd = Path(config_obj["relevant_paths"]["generated_datasets"]) +input_bed = wd / '2024_10_22' / 'liftover' / 'hg38_combined_toy.bed' # Write a util to get latest dir +ann_dir = Path(config_obj["relevant_paths"]["downloads"]) +ann = ann_dir / 'ensembl' / 'Homo_sapiens.GRCh38.113.gff3' # GFF file with genomic features +output_file = wd / '2024_10_22' / 'mapping_ids' / 
'chr_pos_to_ensp_toy.csv' + + +# Step 1: Load and Serialize 'ann' Annotations +def parse_and_serialize_ann(): + annotations = {} + print("Parsing annotations from ann...") + with open(ann, 'r') as f: + for line in f: + if not line.startswith('#'): + fields = line.strip().split('\t') + chrom = 'chr' + fields[0] # Add 'chr' prefix to match format in input_bed + feature_start = int(fields[3]) + feature_end = int(fields[4]) + if chrom not in annotations: + annotations[chrom] = [] + annotations[chrom].append((feature_start, feature_end, line.strip())) + with open('annotations.pkl', 'wb') as f: + pickle.dump(annotations, f) + print("Serialized annotations to 'annotations.pkl'") + +# Step 2: Load 'ann' from Serialized File +def load_annotations(): + with open('annotations.pkl', 'rb') as f: + return pickle.load(f) + +# Step 3: Process 'input_bed' and Write Results in Batches +def process_large_input_bed(): + # Load serialized annotations + annotations = load_annotations() + + # Open output CSV file + with open(output_file, 'w', newline='') as csvfile: + # Define the new headers for the output file + writer = csv.writer(csvfile) + writer.writerow(['chr_id', 'start_pos', 'end_pos', 'entrez_gene_id', 'prot_change', 'ENSP']) + + batch = [] + batch_size = 10000 # Define batch size for writing + + print("Processing SNP positions from input_bed and writing to CSV...") + + with open(input_bed, 'r') as f: + # Skip the header line (if the first line is a header) + header_skipped = False + + for i, line in enumerate(f, start=1): + # Skip the header line + if not header_skipped: + header_skipped = True + continue # Skip the header + + fields = line.strip().split('\t') + + # Check that the necessary fields are numeric before proceeding + try: + start = int(fields[1]) # start_pos + end = int(fields[2]) # end_pos + except ValueError: + print(f"Skipping invalid line {i}: {line.strip()}") + continue # Skip lines where start or end position is not numeric + + chrom = fields[0] # chr_id + 
entrez = fields[3] # entrez_gene_id + prot_change = fields[4] # prot_change + + # Find matching annotations + if chrom in annotations: + for feature_start, feature_end, annotation in annotations[chrom]: + if start >= feature_start and end <= feature_end: + ensp = None + for field in annotation.split(';'): + if 'protein_id=' in field: + ensp = field.split('=')[1] + break + if ensp: + # Add match to batch with renamed fields + batch.append([chrom, start, end, entrez, prot_change, ensp]) + + # Write batch to file every 'batch_size' records + if len(batch) >= batch_size: + writer.writerows(batch) + batch.clear() # Clear batch after writing to file + print(f"Processed {i} lines so far...") # Status update + + # Write remaining entries in the batch + if batch: + writer.writerows(batch) + print("Wrote remaining records to file.") + + print(f"Process completed. Results written to {output_file}") + + +# Run the workflow +parse_and_serialize_ann() # Run once to create the serialized annotations file if needed +process_large_input_bed() # Process large 'input_bed' and write results diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index b2b065b..a913a02 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -26,7 +26,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. 
Results saved to '{output_file}'.") # Specify the directory containing your JSON files -directory_path = "/data/shared/pipelines/cbioportal/mutations" +directory_path = "/data/shared/biomuta/downloads/cbioportal/2024_10_21/mutationss" # Run the function find_incomplete_json_files(directory_path) diff --git a/pipeline/download_step1/cbioportal/call_config_with_py.txt b/pipeline/helper_scripts/call_config_with_py.txt similarity index 100% rename from pipeline/download_step1/cbioportal/call_config_with_py.txt rename to pipeline/helper_scripts/call_config_with_py.txt diff --git a/pipeline/download_step1/cbioportal/cleanup.sh b/pipeline/helper_scripts/cleanup.sh similarity index 100% rename from pipeline/download_step1/cbioportal/cleanup.sh rename to pipeline/helper_scripts/cleanup.sh diff --git a/pipeline/download_step1/cbioportal/combined_mapping.py b/pipeline/helper_scripts/combined_mapping.py similarity index 100% rename from pipeline/download_step1/cbioportal/combined_mapping.py rename to pipeline/helper_scripts/combined_mapping.py diff --git a/pipeline/download_step1/cbioportal/extract_uniq_do_names.py b/pipeline/helper_scripts/extract_uniq_do_names.py similarity index 85% rename from pipeline/download_step1/cbioportal/extract_uniq_do_names.py rename to pipeline/helper_scripts/extract_uniq_do_names.py index 5525a22..1bec2fa 100644 --- a/pipeline/download_step1/cbioportal/extract_uniq_do_names.py +++ b/pipeline/helper_scripts/extract_uniq_do_names.py @@ -1,6 +1,6 @@ import json -# Replace 'path_to_your_file.json' with the path to your JSON file +# Path to JSON file file_path = '/data/shared/biomuta/generated/datasets/2024_10_22/cancer_types_with_do.json' # Open and load JSON data from the file diff --git a/pipeline/download_step1/cbioportal/find_duplicates.sh b/pipeline/helper_scripts/find_duplicates.sh similarity index 100% rename from pipeline/download_step1/cbioportal/find_duplicates.sh rename to pipeline/helper_scripts/find_duplicates.sh diff --git 
a/pipeline/download_step1/cbioportal/instructions.md b/pipeline/helper_scripts/instructions.md similarity index 100% rename from pipeline/download_step1/cbioportal/instructions.md rename to pipeline/helper_scripts/instructions.md diff --git a/pipeline/download_step1/cbioportal/instructions.py b/pipeline/helper_scripts/instructions.py similarity index 100% rename from pipeline/download_step1/cbioportal/instructions.py rename to pipeline/helper_scripts/instructions.py From 16e9007f83503a0164675b71ff972e4a38d0f20f Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 14:18:16 -0500 Subject: [PATCH 5/7] Upd gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index c02bc7f..a4ae9f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ generated_datasets +pipeline/convert_step2/liftover/annotations.pkl pipeline/convert_step2/liftover/Homo_sapiens.GRCh38.113.gff3 +utils/__pycache__/ .*/ ../downloads *.log From ca0a660c01c9a202213fd92461eeb869a940e8c3 Mon Sep 17 00:00:00 2001 From: Reeya Gupta Date: Tue, 3 Dec 2024 12:46:44 -0500 Subject: [PATCH 6/7] updated scripts with paths dynamically fetched from config.json --- pipeline/config.json | 4 ++- .../cbioportal/1_generate_cancer_do_json.py | 28 +++++++++++-------- .../cbioportal/2_ensp_to_uniprot_batch.sh | 8 ++++-- .../cbioportal/3_canonical_yes_no.py | 19 +++++++++++-- .../cbioportal/4_compare_fasta.py | 17 ++++++++--- pipeline/convert_step2/liftover/2_liftover.sh | 21 ++++++++------ .../cbioportal/fetch_mutations_old.sh | 12 +++++--- .../cbioportal/find_incomplete.py | 22 +++++++++++++-- .../cbioportal/integrate_cancer_types.sh | 22 ++++++++++----- 9 files changed, 109 insertions(+), 44 deletions(-) diff --git a/pipeline/config.json b/pipeline/config.json index 69115d9..cc1bac0 100644 --- a/pipeline/config.json +++ b/pipeline/config.json @@ -1,5 +1,7 @@ { "relevant_paths": { + "repos_generated_datasets": "/data/shared/repos/biomuta-old/generated_datasets", + 
"repos_downloads": "/data/shared/repos/biomuta-old/downloads", "downloads": "/data/shared/biomuta/downloads", "generated_datasets": "/data/shared/biomuta/generated/datasets", "mapping": "/data/shared/biomuta/pipeline/convert_step2/mapping" @@ -7,4 +9,4 @@ "resource_init": { "cbioportal": ["subfolder1", "subfolder2"] } -} \ No newline at end of file +} diff --git a/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py index b8f487a..c78ed1d 100644 --- a/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py +++ b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py @@ -5,12 +5,12 @@ from pathlib import Path from typing import Optional +# Configure logging logging.basicConfig(filename="cancer_mapping.log", filemode='a', format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO) - # Logging levels # 1. error # 2. warning @@ -27,24 +27,27 @@ #from utils import ROOT_DIR #__init__.py, ROOT_DIR is a global var # Define paths - # Get the directory of this script script_dir = Path(__file__).resolve().parent # Navigate to config.json location relative to script config_dir = script_dir.parent.parent -# Load config -with open(config_dir/'config.json') as config_file: + +# Load config.json +with open(config_dir / 'config.json') as config_file: config = json.load(config_file) + # Access paths from config + mapping_dir = Path(config["relevant_paths"]["mapping"]) doid_mapping = mapping_dir / "combined_do_mapping.json" fallback_do_map = mapping_dir / "fallback_cbio_doid_mapping.json" # Input and output file names -# Get the latest directory -directory_path = Path(config["relevant_paths"]["generated_datasets"]) -latest_dir = max([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))], key=lambda d: os.path.getctime(os.path.join(directory_path, d))) -latest_dir = Path(directory_path) / latest_dir +# Get the latest 
directory in generated_datasets +generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"]) +latest_dir = max([d for d in os.listdir(generated_datasets_dir) if os.path.isdir(os.path.join(generated_datasets_dir, d))], key=lambda d: os.path.getctime(os.path.join(generated_datasets_dir, d))) +latest_dir = Path(generated_datasets_dir) / latest_dir + def ask_confirmation(prompt): while True: user_input = input(f"{prompt} (y/n): ").strip().lower() @@ -54,11 +57,12 @@ def ask_confirmation(prompt): return False else: print(f"Invalid input. Please enter 'y' for yes or 'n' for no.") + if ask_confirmation(f"The latest created directory is: {latest_dir}. Proceed?"): - input_file = Path(latest_dir) / "unique_cancer_names.txt" - cancer_types_with_do = Path(latest_dir) / "cancer_types_with_do.json" - cancer_type_per_study = Path(latest_dir) / "cancer_type_per_study.json" - study_ids_with_do = Path(latest_dir) / "study_ids_with_do.json" + input_file = latest_dir / "unique_cancer_names.txt" + cancer_types_with_do = latest_dir / "cancer_types_with_do.json" + cancer_type_per_study = latest_dir / "cancer_type_per_study.json" + study_ids_with_do = latest_dir / "study_ids_with_do.json" print(f"Using {latest_dir}/unique_cancer_names.txt and writing out to {latest_dir}/cancer_types_with_do.json") else: sys.exit("Aborted by user.") diff --git a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh index c4dc21d..8339881 100755 --- a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh +++ b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh @@ -1,8 +1,12 @@ #!/bin/bash +# Load config.json and extract paths +config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json" +repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file") + # Input and output file paths 
-input_csv="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file -output_json="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file +input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file +output_json="$repos_generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file batch_size=5000 # Number of ENSP IDs per batch (adjustable) diff --git a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py index 492ba1a..dfee946 100644 --- a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py +++ b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py @@ -1,12 +1,25 @@ import json import pandas as pd +from pathlib import Path + +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Retrieve paths from updated config +repos_generated_datasets = Path(config["relevant_paths"]["repos_generated_datasets"]) +repos_downloads = Path(config["relevant_paths"]["repos_downloads"]) +isoform_data_path = repos_downloads / "glygen/human_protein_masterlist.csv" +ensp_to_uniprot_path = repos_generated_datasets / "2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json" +canonical_toy_output_path = repos_generated_datasets / "2024_10_22/mapping_ids/canonical_toy.json" # Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: +with open(ensp_to_uniprot_path, "r") as f: ensp_to_uniprot = json.load(f) # Load the isoform data CSV -isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") +isoform_data = pd.read_csv(isoform_data_path) # Prepare a dictionary to 
store the results result = {} @@ -42,5 +55,5 @@ def strip_suffix(identifier): break # Exit inner loop once the first match is found # Write the result to a JSON file -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: +with open(canonical_toy_output_path, "w") as json_file: json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/4_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py index a59ecb0..3538615 100644 --- a/pipeline/convert_step2/cbioportal/4_compare_fasta.py +++ b/pipeline/convert_step2/cbioportal/4_compare_fasta.py @@ -1,13 +1,22 @@ import requests import json +from pathlib import Path + +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Retrieve paths from config.json +repos_base = Path(config["relevant_paths"]["repos"]) +ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json" # Load your JSON file containing ENSEMBL to UniProt mappings -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json") as file: +with open(ensembl_uniprot_map_path, "r") as file: ensembl_uniprot_map = json.load(file) -def fetch_ensembl_sequence(ensembl_id): #it works +def fetch_ensembl_sequence(ensembl_id): # Fetch ENSEMBL sequence url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain" - #https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/x-fasta" response = requests.get(url) if response.status_code == 200: return response.text.strip() @@ -15,7 +24,7 @@ def fetch_ensembl_sequence(ensembl_id): #it works print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}") return None -def fetch_uniprot_sequence(uniprot_id): #it works +def fetch_uniprot_sequence(uniprot_id): # Fetch UniProt sequence url = 
f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" response = requests.get(url) if response.status_code == 200: diff --git a/pipeline/convert_step2/liftover/2_liftover.sh b/pipeline/convert_step2/liftover/2_liftover.sh index 1f2617f..f25715b 100755 --- a/pipeline/convert_step2/liftover/2_liftover.sh +++ b/pipeline/convert_step2/liftover/2_liftover.sh @@ -1,9 +1,12 @@ #!/bin/bash -wd="/data/shared/biomuta/generated/datasets/2024_10_22/liftover" +# Load paths from config.json using jq +config_file="/path/to/config.json" # Replace with the actual path to your config.json +generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file) +liftover_dir="${generated_datasets}/2024_10_22/liftover" # Extract rows with GRCh38 and save as tab-separated: -awk '$5 == "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/cbio_hg38.bed +awk '$5 == "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/cbio_hg38.bed # Check if the extraction and modification were successful if [ $? -ne 0 ]; then echo "Error extracting or modifying rows for GRCh38." @@ -11,7 +14,7 @@ if [ $? -ne 0 ]; then fi # Extract all other rows (where the 5th column is not GRCh38) and save as tab-separated: -awk '$5 != "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/hg19entrez_protChange.bed +awk '$5 != "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/hg19entrez_protChange.bed # Check if the extraction and modification were successful if [ $? -ne 0 ]; then echo "Error extracting or modifying non-GRCh38 rows." @@ -19,7 +22,7 @@ if [ $? 
-ne 0 ]; then fi # Run liftOver to convert coordinates (first chain) -./liftOver ${wd}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ucsc_unmapped_entrez_protChange.bed +./liftOver ${liftover_dir}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed # Check if the first liftOver was successful if [ $? -ne 0 ]; then @@ -28,7 +31,7 @@ if [ $? -ne 0 ]; then fi # Run liftOver to convert coordinates (second chain) -./liftOver ${wd}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${wd}/ensembl_hg38entrez_protChange.bed ${wd}/ensembl_unmapped_entrez_protChange.bed +./liftOver ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${liftover_dir}/ensembl_hg38entrez_protChange.bed ${liftover_dir}/ensembl_unmapped_entrez_protChange.bed # Check if the second liftOver was successful if [ $? -ne 0 ]; then @@ -37,14 +40,14 @@ if [ $? 
-ne 0 ]; then fi # Prepend 'chr' to the 1st column of ensembl_hg38entrez_protChange.bed -sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/temp && mv ${wd}/temp ${wd}/ensembl_hg38entrez_protChange.bed +sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/ensembl_hg38entrez_protChange.bed # Combine all hg38 files -cat ${wd}/cbio_hg38.bed ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/hg38_combined.bed +cat ${liftover_dir}/cbio_hg38.bed ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/hg38_combined.bed # Remove duplicate rows taking into account extra tabs -awk -v OFS='\t' '{$1=$1; print}' ${wd}/hg38_combined.bed | sort -u > ${wd}/temp && mv ${wd}/temp ${wd}/hg38_combined.bed +awk -v OFS='\t' '{$1=$1; print}' ${liftover_dir}/hg38_combined.bed | sort -u > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/hg38_combined.bed # Add headers with tab separation -sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${wd}/hg38_combined.bed +sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${liftover_dir}/hg38_combined.bed echo "Script completed successfully." 
diff --git a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh index fc0291c..4175254 100755 --- a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh +++ b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh @@ -1,17 +1,21 @@ #!/bin/bash +# Load paths from config.json +CONFIG_FILE="/path/to/config.json" +DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE") + # Today's date TODAY=$(date +"%Y_%m_%d") -# Download directory -OUTPUT_DIR="/data/shared/biomuta/generated/datasets/${TODAY}" +# Output directory +OUTPUT_DIR="${DOWNLOADS_DIR}/${TODAY}" mkdir -p "${OUTPUT_DIR}/mutations" # Base URLs for the API STUDY_URL="https://www.cbioportal.org/api/studies" MUTATIONS_URL="https://www.cbioportal.org/api/molecular-profiles" -#get study IDs +# Get study IDs curl -G "${STUDY_URL}" \ -H "accept: application/json" \ -o "${OUTPUT_DIR}/all_studies.json" @@ -82,4 +86,4 @@ while IFS= read -r study_id; do done done -done < "$STUDY_IDS_FILE" \ No newline at end of file +done < "$STUDY_IDS_FILE" diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index a913a02..e59b0bf 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -1,7 +1,16 @@ import os import ijson +import json +from pathlib import Path def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): + """ + Function to find incomplete or corrupted JSON files in the specified directory. + + Args: + directory (str): Directory containing JSON files. + output_file (str): File to save the names of incomplete JSON files. + """ # List to store the names of incomplete files incomplete_files = [] @@ -25,8 +34,17 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. 
Results saved to '{output_file}'.") -# Specify the directory containing your JSON files -directory_path = "/data/shared/biomuta/downloads/cbioportal/2024_10_21/mutationss" +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Get base directory from config +downloads_base = Path(config["relevant_paths"]["downloads"]) / "cbioportal/2024_10_21" + + +directory_path = downloads_base / "mutations" + # Run the function find_incomplete_json_files(directory_path) diff --git a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh index 85bc880..62f2039 100644 --- a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh +++ b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh @@ -1,12 +1,18 @@ #!/bin/bash -# This script extracts study IDs and the corresponding cancer names. It takes as input the output of cancer_types.sh +# Load the config file to dynamically retrieve paths +config_file="path/to/config.json" -# Define the directory where your JSON files are located -input_dir="/data/shared/biomuta/downloads/cbioportal/2024_10_21/cancer_types" +# Get paths from config +input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file") +output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file") -# Define the output TSV file -output_file="/data/shared/biomuta/generated/datasets/2024_10_22/cancer_type_per_study.json" +# Define the output files +output_file="$output_dir/cancer_type_per_study.json" +unique_cancer_names_file="$output_dir/unique_cancer_names.json" + +# Create the output directory if it doesn't exist +mkdir -p "$output_dir" # Initialize the JSON array echo "[" > "$output_file" @@ -42,5 +48,7 @@ echo "]" >> "$output_file" echo "Data successfully written to $output_file" -# Make a list of unique cancer
names in json format -jq -r '.[].cancerType' $output_file | sort | uniq | jq -R . | jq -s . > unique_cancer_names.json \ No newline at end of file +# Make a list of unique cancer names in JSON format +jq -r '.[].cancerType' "$output_file" | sort | uniq | jq -R . | jq -s . > "$unique_cancer_names_file" + +echo "Unique cancer names written to $unique_cancer_names_file" From 48d1a1ea64db2b2743c9e428b43894395f8bde0e Mon Sep 17 00:00:00 2001 From: Reeya Gupta Date: Wed, 4 Dec 2024 13:12:05 -0500 Subject: [PATCH 7/7] Updated paths to config file --- .../cbioportal/2_ensp_to_uniprot_batch.sh | 8 ++-- .../cbioportal/3_canonical_yes_no.py | 2 +- .../cbioportal/4_compare_fasta.py | 4 +- .../liftover/1_chr_pos_to_bed.py | 40 ++++++++++--------- pipeline/convert_step2/liftover/2_liftover.sh | 9 +++-- .../cbioportal/fetch_mutations_old.sh | 6 ++- .../cbioportal/find_incomplete.py | 2 +- .../cbioportal/integrate_cancer_types.sh | 12 ++++-- 8 files changed, 49 insertions(+), 34 deletions(-) diff --git a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh index 8339881..432dc1e 100755 --- a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh +++ b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh @@ -1,8 +1,10 @@ #!/bin/bash -# Load config.json and extract paths -config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json" -repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file") +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" +repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$CONFIG_FILE") # Input and output file paths input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file diff --git a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py 
b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py index dfee946..e1a055d 100644 --- a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py +++ b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py @@ -3,7 +3,7 @@ from pathlib import Path # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) diff --git a/pipeline/convert_step2/cbioportal/4_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py index 3538615..9e385c1 100644 --- a/pipeline/convert_step2/cbioportal/4_compare_fasta.py +++ b/pipeline/convert_step2/cbioportal/4_compare_fasta.py @@ -3,12 +3,12 @@ from pathlib import Path # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) # Retrieve paths from config.json -repos_base = Path(config["relevant_paths"]["repos"]) +repos_base = Path(config["relevant_paths"]["repos_generated_datasets"]) ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json" # Load your JSON file containing ENSEMBL to UniProt mappings diff --git a/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py b/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py index a976bc3..86ef4a7 100644 --- a/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py +++ b/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py @@ -16,17 +16,27 @@ #GRCh37: 30925146 #NA: 3143 #GRCh38: 6253932 - import os import json import glob import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent)) -from utils import ROOT_DIR -from utils.config import get_config +# Load config.json directly and ensure paths are resolved correctly +CONFIG_FILE = 
Path(__file__).resolve().parent.parent.parent / "config.json" +if not CONFIG_FILE.exists(): + raise FileNotFoundError(f"Config file not found at {CONFIG_FILE}") + +with open(CONFIG_FILE, "r") as config_file: + config = json.load(config_file) +# Retrieve paths from config +downloads_dir = Path(config["relevant_paths"]["downloads"]) +generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"]) + +# Define input and output directories +input_directory = downloads_dir / "cbioportal" / "2024_10_21" / "mutations" +output_bed_file = generated_datasets_dir / "2024_10_22" / "liftover" / "hg19entrez_build_protChange.bed" def process_json_to_bed(input_directory, output_bed_file): buffer = [] @@ -49,26 +59,25 @@ def process_json_to_bed(input_directory, output_bed_file): # Check genome build and SNP criteria if record.get('ncbiBuild') == 'NA': continue - if record.get('variantType') != 'SNP': #take SNPs only + if record.get('variantType') != 'SNP': # Take SNPs only continue - if record['endPosition'] - record['startPosition'] != 0: #additional check to confirm SNP + if record['endPosition'] - record['startPosition'] != 0: # Additional check to confirm SNP continue - if 'splice' in record.get('proteinChange', ''): #no splice site mutations + if 'splice' in record.get('proteinChange', ''): # No splice site mutations continue # Extract chromosome, start position, and end position - chr_ = record['chr'] # Convert specific chromosome values and exclude unwanted chromosomes - if chr_ == '23': + if chr_ == '23': chr_ = 'X' if chr_ == '24': chr_ = 'Y' if chr_ in ['MT', 'NA']: - continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values + continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values if not chr_.startswith('chr'): chr_ = 'chr' + chr_ - start_pos = record['startPosition'] - 1 # Convert to 0-based for BED + start_pos = record['startPosition'] - 1 # Convert to 0-based for BED end_pos = record['endPosition'] 
entrez_gene_id = record['entrezGeneId'] ncbi_build = record['ncbiBuild'] @@ -77,10 +86,10 @@ def process_json_to_bed(input_directory, output_bed_file): row = f"{chr_}\t{start_pos}\t{end_pos}\t{entrez_gene_id}\t{ncbi_build}\t{protein_change}\n" if row not in unique_rows: # Only add unique rows unique_rows.add(row) - buffer.append(row) # Append row to buffer + buffer.append(row) # Append row to buffer # Write buffer to file when it reaches the specified size - if len(buffer) >= buffer_size: + if len(buffer) >= buffer_size: bed_file.writelines(buffer) buffer.clear() @@ -110,9 +119,4 @@ def process_json_to_bed(input_directory, output_bed_file): # Run the function -config_obj = get_config() -dl_dir = Path(config_obj["relevant_paths"]["downloads"]) -out_dir = Path(config_obj["relevant_paths"]["generated_datasets"]) -input_directory = dl_dir / 'cbioportal' / '2024_10_21' / 'mutations' # Write a util to get latest dir -output_bed_file = out_dir / '2024_10_22' / 'liftover' / 'hg19entrez_build_protChange.bed' #Write a util to get latest dir process_json_to_bed(input_directory, output_bed_file) diff --git a/pipeline/convert_step2/liftover/2_liftover.sh b/pipeline/convert_step2/liftover/2_liftover.sh index f25715b..787e354 100755 --- a/pipeline/convert_step2/liftover/2_liftover.sh +++ b/pipeline/convert_step2/liftover/2_liftover.sh @@ -1,8 +1,11 @@ #!/bin/bash -# Load paths from config.json using jq -config_file="/path/to/config.json" # Replace with the actual path to your config.json -generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file) +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" + +generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $CONFIG_FILE) liftover_dir="${generated_datasets}/2024_10_22/liftover" # Extract rows with GRCh38 and save as tab-separated: diff --git a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh 
b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh index 4175254..90c9eea 100755 --- a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh +++ b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh @@ -1,7 +1,9 @@ #!/bin/bash -# Load paths from config.json -CONFIG_FILE="/path/to/config.json" +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE") # Today's date diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index e59b0bf..c509690 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -35,7 +35,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. Results saved to '{output_file}'.") # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) diff --git a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh index 62f2039..5186ba4 100644 --- a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh +++ b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh @@ -1,11 +1,15 @@ #!/bin/bash -# Load the config file to dynamically retrieve paths -config_file="path/to/config.json" + +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" + # Get paths from config -input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file") -output_dir=$(jq -r 
'.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file") +input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$CONFIG_FILE") +output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$CONFIG_FILE") # Define the output files output_file="$output_dir/cancer_type_per_study.json"