From e345f6bdf8db72b4bc8a13452af9dc1486db42a2 Mon Sep 17 00:00:00 2001 From: mariacuria Date: Tue, 26 Nov 2024 14:53:47 -0500 Subject: [PATCH 1/7] Tidy up directory --- .../convert_step2/cbioportal/{canonical.py => 2_canonical.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pipeline/convert_step2/cbioportal/{canonical.py => 2_canonical.py} (100%) diff --git a/pipeline/convert_step2/cbioportal/canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py similarity index 100% rename from pipeline/convert_step2/cbioportal/canonical.py rename to pipeline/convert_step2/cbioportal/2_canonical.py From 2da1abeaa972fccee41edc21c5d7d0ef2c1de00d Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 10:18:06 -0500 Subject: [PATCH 2/7] Compare fasta sequences initial commit --- .../convert_step2/cbioportal/2_canonical.py | 4 +- .../convert_step2/cbioportal/compare_fasta.py | 38 +++++++++++++++++++ .../cbioportal/compare_fasta_test.py | 12 ++++++ 3 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 pipeline/convert_step2/cbioportal/compare_fasta.py create mode 100644 pipeline/convert_step2/cbioportal/compare_fasta_test.py diff --git a/pipeline/convert_step2/cbioportal/2_canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py index 8fb9d95..3149272 100644 --- a/pipeline/convert_step2/cbioportal/2_canonical.py +++ b/pipeline/convert_step2/cbioportal/2_canonical.py @@ -2,7 +2,7 @@ import pandas as pd # Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json", "r") as f: +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: ensp_to_uniprot = json.load(f) # Load the isoform data CSV @@ -34,5 +34,5 @@ def strip_suffix(identifier): break # Exit inner loop once the first match is found # Write the result to a JSON file -with 
open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical.json", "w") as json_file: +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/compare_fasta.py b/pipeline/convert_step2/cbioportal/compare_fasta.py new file mode 100644 index 0000000..a59ecb0 --- /dev/null +++ b/pipeline/convert_step2/cbioportal/compare_fasta.py @@ -0,0 +1,38 @@ +import requests +import json + +# Load your JSON file containing ENSEMBL to UniProt mappings +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json") as file: + ensembl_uniprot_map = json.load(file) + +def fetch_ensembl_sequence(ensembl_id): #it works + url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain" + #https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/x-fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.strip() + else: + print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}") + return None + +def fetch_uniprot_sequence(uniprot_id): #it works + url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.split('\n', 1)[1].replace('\n', '') + else: + print(f"Failed to fetch UniProt sequence for {uniprot_id}") + return None + +# Compare sequences +for ensembl_id, uniprot_id in ensembl_uniprot_map.items(): + ensembl_sequence = fetch_ensembl_sequence(ensembl_id) + uniprot_sequence = fetch_uniprot_sequence(uniprot_id) + + if ensembl_sequence and uniprot_sequence: + if ensembl_sequence == uniprot_sequence: + print(f"Sequences match for {ensembl_id} and {uniprot_id}") + else: + print(f"Sequences do not match for {ensembl_id} and {uniprot_id}") + else: + print(f"Could not compare sequences for {ensembl_id} and 
{uniprot_id}") diff --git a/pipeline/convert_step2/cbioportal/compare_fasta_test.py b/pipeline/convert_step2/cbioportal/compare_fasta_test.py new file mode 100644 index 0000000..237daee --- /dev/null +++ b/pipeline/convert_step2/cbioportal/compare_fasta_test.py @@ -0,0 +1,12 @@ +import requests + +def fetch_uniprot_sequence(uniprot_id): + url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" + response = requests.get(url) + if response.status_code == 200: + return response.text.split('\n', 1)[1].replace('\n', '') + else: + print(f"Failed to fetch UniProt sequence for {uniprot_id}") + return None + +print(fetch_uniprot_sequence('Q9NXB0')) \ No newline at end of file From 56b0686bf8f5b7f45c1481af37d3ae998da98b15 Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 11:24:49 -0500 Subject: [PATCH 3/7] Write whether or not a given ENSP is canonical --- .../cbioportal/2_canonical_yes_no.py | 46 +++++++++++++++++++ .../{compare_fasta.py => 3_compare_fasta.py} | 0 2 files changed, 46 insertions(+) create mode 100644 pipeline/convert_step2/cbioportal/2_canonical_yes_no.py rename pipeline/convert_step2/cbioportal/{compare_fasta.py => 3_compare_fasta.py} (100%) diff --git a/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py new file mode 100644 index 0000000..492ba1a --- /dev/null +++ b/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py @@ -0,0 +1,46 @@ +import json +import pandas as pd + +# Load the ENSP to UniProt mapping JSON +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: + ensp_to_uniprot = json.load(f) + +# Load the isoform data CSV +isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") + +# Prepare a dictionary to store the results +result = {} + +# Function to strip suffixes (anything after a hyphen) from both isoform IDs and UniProtKB Canonical 
ACs +def strip_suffix(identifier): + if isinstance(identifier, str) and '-' in identifier: + return identifier.split('-')[0] # Strip everything after the first hyphen + return identifier + +# Iterate over each ENSP and its corresponding UniProt ID +for ensp, uniprot in ensp_to_uniprot.items(): + # Default to "no" for canonical until proven otherwise + is_canonical = "no" + + # Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms + for _, entry in isoform_data.iterrows(): + # Strip suffixes from isoform IDs before comparison + reviewed_isoforms = strip_suffix(entry.get("reviewed_isoforms", "")) + unreviewed_isoforms = strip_suffix(entry.get("unreviewed_isoforms", "")) + + # Check if the UniProt ID matches any isoform (stripped version) + if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms: + # If a match is found, add the uniprotkb_canonical_ac to the result + uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac")) + + # Check if the UniProt ID matches the canonical version + if uniprot == uniprotkb_ac: + is_canonical = "yes" + + # Store the result with canonical status + result[ensp] = {"uniprotkb_canonical_ac": uniprotkb_ac, "canonical": is_canonical} + break # Exit inner loop once the first match is found + +# Write the result to a JSON file +with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: + json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/compare_fasta.py b/pipeline/convert_step2/cbioportal/3_compare_fasta.py similarity index 100% rename from pipeline/convert_step2/cbioportal/compare_fasta.py rename to pipeline/convert_step2/cbioportal/3_compare_fasta.py From a077d36187f56dec1de9ad357b14d102d95c0aea Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 14:15:54 -0500 Subject: [PATCH 4/7] Moved scripts to proper directories --- .../cbioportal/1_generate_cancer_do_json.py} | 0 
.../convert_step2/cbioportal/2_canonical.py | 38 ------ ...ot_batch.sh => 2_ensp_to_uniprot_batch.sh} | 0 ...onical_yes_no.py => 3_canonical_yes_no.py} | 0 ...{3_compare_fasta.py => 4_compare_fasta.py} | 0 .../liftover/3_get_ensp_ver2_toy.py | 112 ++++++++++++++++++ .../cbioportal/find_incomplete.py | 2 +- .../call_config_with_py.txt | 0 .../cbioportal => helper_scripts}/cleanup.sh | 0 .../combined_mapping.py | 0 .../extract_uniq_do_names.py | 2 +- .../find_duplicates.sh | 0 .../instructions.md | 0 .../instructions.py | 0 14 files changed, 114 insertions(+), 40 deletions(-) rename pipeline/{download_step1/cbioportal/generate_cancer_do_json.py => convert_step2/cbioportal/1_generate_cancer_do_json.py} (100%) delete mode 100644 pipeline/convert_step2/cbioportal/2_canonical.py rename pipeline/convert_step2/cbioportal/{1_ensp_to_uniprot_batch.sh => 2_ensp_to_uniprot_batch.sh} (100%) rename pipeline/convert_step2/cbioportal/{2_canonical_yes_no.py => 3_canonical_yes_no.py} (100%) rename pipeline/convert_step2/cbioportal/{3_compare_fasta.py => 4_compare_fasta.py} (100%) create mode 100644 pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py rename pipeline/{download_step1/cbioportal => helper_scripts}/call_config_with_py.txt (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/cleanup.sh (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/combined_mapping.py (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/extract_uniq_do_names.py (85%) rename pipeline/{download_step1/cbioportal => helper_scripts}/find_duplicates.sh (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/instructions.md (100%) rename pipeline/{download_step1/cbioportal => helper_scripts}/instructions.py (100%) diff --git a/pipeline/download_step1/cbioportal/generate_cancer_do_json.py b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py similarity index 100% rename from 
pipeline/download_step1/cbioportal/generate_cancer_do_json.py rename to pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py diff --git a/pipeline/convert_step2/cbioportal/2_canonical.py b/pipeline/convert_step2/cbioportal/2_canonical.py deleted file mode 100644 index 3149272..0000000 --- a/pipeline/convert_step2/cbioportal/2_canonical.py +++ /dev/null @@ -1,38 +0,0 @@ -import json -import pandas as pd - -# Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: - ensp_to_uniprot = json.load(f) - -# Load the isoform data CSV -isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") - -# Prepare a dictionary to store the results -result = {} - -# Function to strip suffixes (anything after a hyphen) from both isoform IDs and UniProtKB Canonical ACs -def strip_suffix(identifier): - if isinstance(identifier, str) and '-' in identifier: - return identifier.split('-')[0] # Strip everything after the first hyphen - return identifier - -# Iterate over each ENSP and its corresponding UniProt ID -for ensp, uniprot in ensp_to_uniprot.items(): - # Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms - for _, entry in isoform_data.iterrows(): - # Strip suffixes from isoform IDs before comparison - reviewed_isoforms = strip_suffix(entry.get("reviewed_isoforms", "")) - unreviewed_isoforms = strip_suffix(entry.get("unreviewed_isoforms", "")) - - # Check if the UniProt ID matches any isoform (stripped version) - if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms: - # If a match is found, add the uniprotkb_canonical_ac to the result - uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac")) - # Store the first match found for each ENSP identifier - result[ensp] = uniprotkb_ac - break # Exit inner loop once the first match is found - -# Write the result to a 
JSON file -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: - json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/1_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh similarity index 100% rename from pipeline/convert_step2/cbioportal/1_ensp_to_uniprot_batch.sh rename to pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh diff --git a/pipeline/convert_step2/cbioportal/2_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py similarity index 100% rename from pipeline/convert_step2/cbioportal/2_canonical_yes_no.py rename to pipeline/convert_step2/cbioportal/3_canonical_yes_no.py diff --git a/pipeline/convert_step2/cbioportal/3_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py similarity index 100% rename from pipeline/convert_step2/cbioportal/3_compare_fasta.py rename to pipeline/convert_step2/cbioportal/4_compare_fasta.py diff --git a/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py b/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py new file mode 100644 index 0000000..8d505c1 --- /dev/null +++ b/pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py @@ -0,0 +1,112 @@ +import csv +import pickle +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent)) +from utils import ROOT_DIR +from utils.config import get_config + +# Load input_bed and ann +# Set input directory for JSON files and output file for BED format +config_obj = get_config() +wd = Path(config_obj["relevant_paths"]["generated_datasets"]) +input_bed = wd / '2024_10_22' / 'liftover' / 'hg38_combined_toy.bed' # Write a util to get latest dir +ann_dir = Path(config_obj["relevant_paths"]["downloads"]) +ann = ann_dir / 'ensembl' / 'Homo_sapiens.GRCh38.113.gff3' # GFF file with genomic features +output_file = wd / '2024_10_22' / 'mapping_ids' / 
'chr_pos_to_ensp_toy.csv' + + +# Step 1: Load and Serialize 'ann' Annotations +def parse_and_serialize_ann(): + annotations = {} + print("Parsing annotations from ann...") + with open(ann, 'r') as f: + for line in f: + if not line.startswith('#'): + fields = line.strip().split('\t') + chrom = 'chr' + fields[0] # Add 'chr' prefix to match format in input_bed + feature_start = int(fields[3]) + feature_end = int(fields[4]) + if chrom not in annotations: + annotations[chrom] = [] + annotations[chrom].append((feature_start, feature_end, line.strip())) + with open('annotations.pkl', 'wb') as f: + pickle.dump(annotations, f) + print("Serialized annotations to 'annotations.pkl'") + +# Step 2: Load 'ann' from Serialized File +def load_annotations(): + with open('annotations.pkl', 'rb') as f: + return pickle.load(f) + +# Step 3: Process 'input_bed' and Write Results in Batches +def process_large_input_bed(): + # Load serialized annotations + annotations = load_annotations() + + # Open output CSV file + with open(output_file, 'w', newline='') as csvfile: + # Define the new headers for the output file + writer = csv.writer(csvfile) + writer.writerow(['chr_id', 'start_pos', 'end_pos', 'entrez_gene_id', 'prot_change', 'ENSP']) + + batch = [] + batch_size = 10000 # Define batch size for writing + + print("Processing SNP positions from input_bed and writing to CSV...") + + with open(input_bed, 'r') as f: + # Skip the header line (if the first line is a header) + header_skipped = False + + for i, line in enumerate(f, start=1): + # Skip the header line + if not header_skipped: + header_skipped = True + continue # Skip the header + + fields = line.strip().split('\t') + + # Check that the necessary fields are numeric before proceeding + try: + start = int(fields[1]) # start_pos + end = int(fields[2]) # end_pos + except ValueError: + print(f"Skipping invalid line {i}: {line.strip()}") + continue # Skip lines where start or end position is not numeric + + chrom = fields[0] # chr_id + 
entrez = fields[3] # entrez_gene_id + prot_change = fields[4] # prot_change + + # Find matching annotations + if chrom in annotations: + for feature_start, feature_end, annotation in annotations[chrom]: + if start >= feature_start and end <= feature_end: + ensp = None + for field in annotation.split(';'): + if 'protein_id=' in field: + ensp = field.split('=')[1] + break + if ensp: + # Add match to batch with renamed fields + batch.append([chrom, start, end, entrez, prot_change, ensp]) + + # Write batch to file every 'batch_size' records + if len(batch) >= batch_size: + writer.writerows(batch) + batch.clear() # Clear batch after writing to file + print(f"Processed {i} lines so far...") # Status update + + # Write remaining entries in the batch + if batch: + writer.writerows(batch) + print("Wrote remaining records to file.") + + print(f"Process completed. Results written to {output_file}") + + +# Run the workflow +parse_and_serialize_ann() # Run once to create the serialized annotations file if needed +process_large_input_bed() # Process large 'input_bed' and write results diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index b2b065b..a913a02 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -26,7 +26,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. 
Results saved to '{output_file}'.") # Specify the directory containing your JSON files -directory_path = "/data/shared/pipelines/cbioportal/mutations" +directory_path = "/data/shared/biomuta/downloads/cbioportal/2024_10_21/mutationss" # Run the function find_incomplete_json_files(directory_path) diff --git a/pipeline/download_step1/cbioportal/call_config_with_py.txt b/pipeline/helper_scripts/call_config_with_py.txt similarity index 100% rename from pipeline/download_step1/cbioportal/call_config_with_py.txt rename to pipeline/helper_scripts/call_config_with_py.txt diff --git a/pipeline/download_step1/cbioportal/cleanup.sh b/pipeline/helper_scripts/cleanup.sh similarity index 100% rename from pipeline/download_step1/cbioportal/cleanup.sh rename to pipeline/helper_scripts/cleanup.sh diff --git a/pipeline/download_step1/cbioportal/combined_mapping.py b/pipeline/helper_scripts/combined_mapping.py similarity index 100% rename from pipeline/download_step1/cbioportal/combined_mapping.py rename to pipeline/helper_scripts/combined_mapping.py diff --git a/pipeline/download_step1/cbioportal/extract_uniq_do_names.py b/pipeline/helper_scripts/extract_uniq_do_names.py similarity index 85% rename from pipeline/download_step1/cbioportal/extract_uniq_do_names.py rename to pipeline/helper_scripts/extract_uniq_do_names.py index 5525a22..1bec2fa 100644 --- a/pipeline/download_step1/cbioportal/extract_uniq_do_names.py +++ b/pipeline/helper_scripts/extract_uniq_do_names.py @@ -1,6 +1,6 @@ import json -# Replace 'path_to_your_file.json' with the path to your JSON file +# Path to JSON file file_path = '/data/shared/biomuta/generated/datasets/2024_10_22/cancer_types_with_do.json' # Open and load JSON data from the file diff --git a/pipeline/download_step1/cbioportal/find_duplicates.sh b/pipeline/helper_scripts/find_duplicates.sh similarity index 100% rename from pipeline/download_step1/cbioportal/find_duplicates.sh rename to pipeline/helper_scripts/find_duplicates.sh diff --git 
a/pipeline/download_step1/cbioportal/instructions.md b/pipeline/helper_scripts/instructions.md similarity index 100% rename from pipeline/download_step1/cbioportal/instructions.md rename to pipeline/helper_scripts/instructions.md diff --git a/pipeline/download_step1/cbioportal/instructions.py b/pipeline/helper_scripts/instructions.py similarity index 100% rename from pipeline/download_step1/cbioportal/instructions.py rename to pipeline/helper_scripts/instructions.py From 16e9007f83503a0164675b71ff972e4a38d0f20f Mon Sep 17 00:00:00 2001 From: mariacuria Date: Wed, 27 Nov 2024 14:18:16 -0500 Subject: [PATCH 5/7] Upd gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index c02bc7f..a4ae9f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ generated_datasets +pipeline/convert_step2/liftover/annotations.pkl pipeline/convert_step2/liftover/Homo_sapiens.GRCh38.113.gff3 +utils/__pycache__/ .*/ ../downloads *.log From ca0a660c01c9a202213fd92461eeb869a940e8c3 Mon Sep 17 00:00:00 2001 From: Reeya Gupta Date: Tue, 3 Dec 2024 12:46:44 -0500 Subject: [PATCH 6/7] updated scripts with paths dynamically fetched from config.json --- pipeline/config.json | 4 ++- .../cbioportal/1_generate_cancer_do_json.py | 28 +++++++++++-------- .../cbioportal/2_ensp_to_uniprot_batch.sh | 8 ++++-- .../cbioportal/3_canonical_yes_no.py | 19 +++++++++++-- .../cbioportal/4_compare_fasta.py | 17 ++++++++--- pipeline/convert_step2/liftover/2_liftover.sh | 21 ++++++++------ .../cbioportal/fetch_mutations_old.sh | 12 +++++--- .../cbioportal/find_incomplete.py | 22 +++++++++++++-- .../cbioportal/integrate_cancer_types.sh | 22 ++++++++++----- 9 files changed, 109 insertions(+), 44 deletions(-) diff --git a/pipeline/config.json b/pipeline/config.json index 69115d9..cc1bac0 100644 --- a/pipeline/config.json +++ b/pipeline/config.json @@ -1,5 +1,7 @@ { "relevant_paths": { + "repos_generated_datasets": "/data/shared/repos/biomuta-old/generated_datasets", + 
"repos_downloads": "/data/shared/repos/biomuta-old/downloads", "downloads": "/data/shared/biomuta/downloads", "generated_datasets": "/data/shared/biomuta/generated/datasets", "mapping": "/data/shared/biomuta/pipeline/convert_step2/mapping" @@ -7,4 +9,4 @@ "resource_init": { "cbioportal": ["subfolder1", "subfolder2"] } -} \ No newline at end of file +} diff --git a/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py index b8f487a..c78ed1d 100644 --- a/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py +++ b/pipeline/convert_step2/cbioportal/1_generate_cancer_do_json.py @@ -5,12 +5,12 @@ from pathlib import Path from typing import Optional +# Configure logging logging.basicConfig(filename="cancer_mapping.log", filemode='a', format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO) - # Logging levels # 1. error # 2. warning @@ -27,24 +27,27 @@ #from utils import ROOT_DIR #__init__.py, ROOT_DIR is a global var # Define paths - # Get the directory of this script script_dir = Path(__file__).resolve().parent # Navigate to config.json location relative to script config_dir = script_dir.parent.parent -# Load config -with open(config_dir/'config.json') as config_file: + +# Load config.json +with open(config_dir / 'config.json') as config_file: config = json.load(config_file) + # Access paths from config + mapping_dir = Path(config["relevant_paths"]["mapping"]) doid_mapping = mapping_dir / "combined_do_mapping.json" fallback_do_map = mapping_dir / "fallback_cbio_doid_mapping.json" # Input and output file names -# Get the latest directory -directory_path = Path(config["relevant_paths"]["generated_datasets"]) -latest_dir = max([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))], key=lambda d: os.path.getctime(os.path.join(directory_path, d))) -latest_dir = Path(directory_path) / latest_dir +# Get the latest 
directory in generated_datasets +generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"]) +latest_dir = max([d for d in os.listdir(generated_datasets_dir) if os.path.isdir(os.path.join(generated_datasets_dir, d))], key=lambda d: os.path.getctime(os.path.join(generated_datasets_dir, d))) +latest_dir = Path(generated_datasets_dir) / latest_dir + def ask_confirmation(prompt): while True: user_input = input(f"{prompt} (y/n): ").strip().lower() @@ -54,11 +57,12 @@ def ask_confirmation(prompt): return False else: print(f"Invalid input. Please enter 'y' for yes or 'n' for no.") + if ask_confirmation(f"The latest created directory is: {latest_dir}. Proceed?"): - input_file = Path(latest_dir) / "unique_cancer_names.txt" - cancer_types_with_do = Path(latest_dir) / "cancer_types_with_do.json" - cancer_type_per_study = Path(latest_dir) / "cancer_type_per_study.json" - study_ids_with_do = Path(latest_dir) / "study_ids_with_do.json" + input_file = latest_dir / "unique_cancer_names.txt" + cancer_types_with_do = latest_dir / "cancer_types_with_do.json" + cancer_type_per_study = latest_dir / "cancer_type_per_study.json" + study_ids_with_do = latest_dir / "study_ids_with_do.json" print(f"Using {latest_dir}/unique_cancer_names.txt and writing out to {latest_dir}/cancer_types_with_do.json") else: sys.exit("Aborted by user.") diff --git a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh index c4dc21d..8339881 100755 --- a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh +++ b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh @@ -1,8 +1,12 @@ #!/bin/bash +# Load config.json and extract paths +config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json" +repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file") + # Input and output file paths 
-input_csv="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file -output_json="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file +input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file +output_json="$repos_generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file batch_size=5000 # Number of ENSP IDs per batch (adjustable) diff --git a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py index 492ba1a..dfee946 100644 --- a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py +++ b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py @@ -1,12 +1,25 @@ import json import pandas as pd +from pathlib import Path + +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Retrieve paths from updated config +repos_generated_datasets = Path(config["relevant_paths"]["repos_generated_datasets"]) +repos_downloads = Path(config["relevant_paths"]["repos_downloads"]) +isoform_data_path = repos_downloads / "glygen/human_protein_masterlist.csv" +ensp_to_uniprot_path = repos_generated_datasets / "2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json" +canonical_toy_output_path = repos_generated_datasets / "2024_10_22/mapping_ids/canonical_toy.json" # Load the ENSP to UniProt mapping JSON -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json", "r") as f: +with open(ensp_to_uniprot_path, "r") as f: ensp_to_uniprot = json.load(f) # Load the isoform data CSV -isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv") +isoform_data = pd.read_csv(isoform_data_path) # Prepare a dictionary to 
store the results result = {} @@ -42,5 +55,5 @@ def strip_suffix(identifier): break # Exit inner loop once the first match is found # Write the result to a JSON file -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json", "w") as json_file: +with open(canonical_toy_output_path, "w") as json_file: json.dump(result, json_file, indent=4) diff --git a/pipeline/convert_step2/cbioportal/4_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py index a59ecb0..3538615 100644 --- a/pipeline/convert_step2/cbioportal/4_compare_fasta.py +++ b/pipeline/convert_step2/cbioportal/4_compare_fasta.py @@ -1,13 +1,22 @@ import requests import json +from pathlib import Path + +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Retrieve paths from config.json +repos_base = Path(config["relevant_paths"]["repos"]) +ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json" # Load your JSON file containing ENSEMBL to UniProt mappings -with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical_toy.json") as file: +with open(ensembl_uniprot_map_path, "r") as file: ensembl_uniprot_map = json.load(file) -def fetch_ensembl_sequence(ensembl_id): #it works +def fetch_ensembl_sequence(ensembl_id): # Fetch ENSEMBL sequence url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain" - #https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/x-fasta" response = requests.get(url) if response.status_code == 200: return response.text.strip() @@ -15,7 +24,7 @@ def fetch_ensembl_sequence(ensembl_id): #it works print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}") return None -def fetch_uniprot_sequence(uniprot_id): #it works +def fetch_uniprot_sequence(uniprot_id): # Fetch UniProt sequence url = 
f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" response = requests.get(url) if response.status_code == 200: diff --git a/pipeline/convert_step2/liftover/2_liftover.sh b/pipeline/convert_step2/liftover/2_liftover.sh index 1f2617f..f25715b 100755 --- a/pipeline/convert_step2/liftover/2_liftover.sh +++ b/pipeline/convert_step2/liftover/2_liftover.sh @@ -1,9 +1,12 @@ #!/bin/bash -wd="/data/shared/biomuta/generated/datasets/2024_10_22/liftover" +# Load paths from config.json using jq +config_file="/path/to/config.json" # Replace with the actual path to your config.json +generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file) +liftover_dir="${generated_datasets}/2024_10_22/liftover" # Extract rows with GRCh38 and save as tab-separated: -awk '$5 == "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/cbio_hg38.bed +awk '$5 == "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/cbio_hg38.bed # Check if the extraction and modification were successful if [ $? -ne 0 ]; then echo "Error extracting or modifying rows for GRCh38." @@ -11,7 +14,7 @@ if [ $? -ne 0 ]; then fi # Extract all other rows (where the 5th column is not GRCh38) and save as tab-separated: -awk '$5 != "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/hg19entrez_protChange.bed +awk '$5 != "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/hg19entrez_protChange.bed # Check if the extraction and modification were successful if [ $? -ne 0 ]; then echo "Error extracting or modifying non-GRCh38 rows." @@ -19,7 +22,7 @@ if [ $? 
-ne 0 ]; then fi # Run liftOver to convert coordinates (first chain) -./liftOver ${wd}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ucsc_unmapped_entrez_protChange.bed +./liftOver ${liftover_dir}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed # Check if the first liftOver was successful if [ $? -ne 0 ]; then @@ -28,7 +31,7 @@ if [ $? -ne 0 ]; then fi # Run liftOver to convert coordinates (second chain) -./liftOver ${wd}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${wd}/ensembl_hg38entrez_protChange.bed ${wd}/ensembl_unmapped_entrez_protChange.bed +./liftOver ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${liftover_dir}/ensembl_hg38entrez_protChange.bed ${liftover_dir}/ensembl_unmapped_entrez_protChange.bed # Check if the second liftOver was successful if [ $? -ne 0 ]; then @@ -37,14 +40,14 @@ if [ $? 
-ne 0 ]; then fi # Prepend 'chr' to the 1st column of ensembl_hg38entrez_protChange.bed -sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/temp && mv ${wd}/temp ${wd}/ensembl_hg38entrez_protChange.bed +sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/ensembl_hg38entrez_protChange.bed # Combine all hg38 files -cat ${wd}/cbio_hg38.bed ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/hg38_combined.bed +cat ${liftover_dir}/cbio_hg38.bed ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/hg38_combined.bed # Remove duplicate rows taking into account extra tabs -awk -v OFS='\t' '{$1=$1; print}' ${wd}/hg38_combined.bed | sort -u > ${wd}/temp && mv ${wd}/temp ${wd}/hg38_combined.bed +awk -v OFS='\t' '{$1=$1; print}' ${liftover_dir}/hg38_combined.bed | sort -u > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/hg38_combined.bed # Add headers with tab separation -sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${wd}/hg38_combined.bed +sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${liftover_dir}/hg38_combined.bed echo "Script completed successfully." 
diff --git a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh index fc0291c..4175254 100755 --- a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh +++ b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh @@ -1,17 +1,21 @@ #!/bin/bash +# Load paths from config.json +CONFIG_FILE="/path/to/config.json" +DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE") + # Today's date TODAY=$(date +"%Y_%m_%d") -# Download directory -OUTPUT_DIR="/data/shared/biomuta/generated/datasets/${TODAY}" +# Output directory +OUTPUT_DIR="${DOWNLOADS_DIR}/${TODAY}" mkdir -p "${OUTPUT_DIR}/mutations" # Base URLs for the API STUDY_URL="https://www.cbioportal.org/api/studies" MUTATIONS_URL="https://www.cbioportal.org/api/molecular-profiles" -#get study IDs +# Get study IDs curl -G "${STUDY_URL}" \ -H "accept: application/json" \ -o "${OUTPUT_DIR}/all_studies.json" @@ -82,4 +86,4 @@ while IFS= read -r study_id; do done done -done < "$STUDY_IDS_FILE" \ No newline at end of file +done < "$STUDY_IDS_FILE" diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index a913a02..e59b0bf 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -1,7 +1,16 @@ import os import ijson +import json +from pathlib import Path def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): + """ + Function to find incomplete or corrupted JSON files in the specified directory. + + Args: + directory (str): Directory containing JSON files. + output_file (str): File to save the names of incomplete JSON files. + """ # List to store the names of incomplete files incomplete_files = [] @@ -25,8 +34,17 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. 
Results saved to '{output_file}'.") -# Specify the directory containing your JSON files -directory_path = "/data/shared/biomuta/downloads/cbioportal/2024_10_21/mutationss" +# Load config.json +config_path = Path(__file__).resolve().parent.parent / "config.json" +with open(config_path, "r") as config_file: + config = json.load(config_file) + +# Get base directory from config +downloads_base = Path(config["relevant_paths"]["downloads"]) / "cbioportal/2024_10_21" + + +directory_path = downloads_base / "mutations" + # Run the function find_incomplete_json_files(directory_path) diff --git a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh index 85bc880..62f2039 100644 --- a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh +++ b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh @@ -1,12 +1,18 @@ #!/bin/bash -# This script extracts study IDs and the corresponding cancer names. It takes as input the output of cancer_types.sh +# Load the config file to dynamically retrieve paths +config_file="path/to/config.json" -# Define the directory where your JSON files are located -input_dir="/data/shared/biomuta/downloads/cbioportal/2024_10_21/cancer_types" +# Get paths from config +input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file") +output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file") -# Define the output TSV file -output_file="/data/shared/biomuta/generated/datasets/2024_10_22/cancer_type_per_study.json" +# Define the output files +output_file="$output_dir/cancer_type_per_study.json" +unique_cancer_names_file="$output_dir/unique_cancer_names.json" + +# Create the output directory if it doesn't exist +mkdir -p "$output_dir" # Initialize the JSON array echo "[" > "$output_file" @@ -42,5 +48,7 @@ echo "]" >> "$output_file" echo "Data successfully written to $output_file" -# Make a list of unique cancer
names in json format -jq -r '.[].cancerType' $output_file | sort | uniq | jq -R . | jq -s . > unique_cancer_names.json \ No newline at end of file +# Make a list of unique cancer names in JSON format +jq -r '.[].cancerType' "$output_file" | sort | uniq | jq -R . | jq -s . > "$unique_cancer_names_file" + +echo "Unique cancer names written to $unique_cancer_names_file" From 48d1a1ea64db2b2743c9e428b43894395f8bde0e Mon Sep 17 00:00:00 2001 From: Reeya Gupta Date: Wed, 4 Dec 2024 13:12:05 -0500 Subject: [PATCH 7/7] Updated paths to config file --- .../cbioportal/2_ensp_to_uniprot_batch.sh | 8 ++-- .../cbioportal/3_canonical_yes_no.py | 2 +- .../cbioportal/4_compare_fasta.py | 4 +- .../liftover/1_chr_pos_to_bed.py | 40 ++++++++++--------- pipeline/convert_step2/liftover/2_liftover.sh | 9 +++-- .../cbioportal/fetch_mutations_old.sh | 6 ++- .../cbioportal/find_incomplete.py | 2 +- .../cbioportal/integrate_cancer_types.sh | 12 ++++-- 8 files changed, 49 insertions(+), 34 deletions(-) diff --git a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh index 8339881..432dc1e 100755 --- a/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh +++ b/pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh @@ -1,8 +1,10 @@ #!/bin/bash -# Load config.json and extract paths -config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json" -repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file") +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" +repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$CONFIG_FILE") # Input and output file paths input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file diff --git a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py 
b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py index dfee946..e1a055d 100644 --- a/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py +++ b/pipeline/convert_step2/cbioportal/3_canonical_yes_no.py @@ -3,7 +3,7 @@ from pathlib import Path # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) diff --git a/pipeline/convert_step2/cbioportal/4_compare_fasta.py b/pipeline/convert_step2/cbioportal/4_compare_fasta.py index 3538615..9e385c1 100644 --- a/pipeline/convert_step2/cbioportal/4_compare_fasta.py +++ b/pipeline/convert_step2/cbioportal/4_compare_fasta.py @@ -3,12 +3,12 @@ from pathlib import Path # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) # Retrieve paths from config.json -repos_base = Path(config["relevant_paths"]["repos"]) +repos_base = Path(config["relevant_paths"]["repos_generated_datasets"]) ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json" # Load your JSON file containing ENSEMBL to UniProt mappings diff --git a/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py b/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py index a976bc3..86ef4a7 100644 --- a/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py +++ b/pipeline/convert_step2/liftover/1_chr_pos_to_bed.py @@ -16,17 +16,27 @@ #GRCh37: 30925146 #NA: 3143 #GRCh38: 6253932 - import os import json import glob import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent)) -from utils import ROOT_DIR -from utils.config import get_config +# Load config.json directly and ensure paths are resolved correctly +CONFIG_FILE = 
Path(__file__).resolve().parent.parent.parent / "config.json" +if not CONFIG_FILE.exists(): + raise FileNotFoundError(f"Config file not found at {CONFIG_FILE}") + +with open(CONFIG_FILE, "r") as config_file: + config = json.load(config_file) +# Retrieve paths from config +downloads_dir = Path(config["relevant_paths"]["downloads"]) +generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"]) + +# Define input and output directories +input_directory = downloads_dir / "cbioportal" / "2024_10_21" / "mutations" +output_bed_file = generated_datasets_dir / "2024_10_22" / "liftover" / "hg19entrez_build_protChange.bed" def process_json_to_bed(input_directory, output_bed_file): buffer = [] @@ -49,26 +59,25 @@ def process_json_to_bed(input_directory, output_bed_file): # Check genome build and SNP criteria if record.get('ncbiBuild') == 'NA': continue - if record.get('variantType') != 'SNP': #take SNPs only + if record.get('variantType') != 'SNP': # Take SNPs only continue - if record['endPosition'] - record['startPosition'] != 0: #additional check to confirm SNP + if record['endPosition'] - record['startPosition'] != 0: # Additional check to confirm SNP continue - if 'splice' in record.get('proteinChange', ''): #no splice site mutations + if 'splice' in record.get('proteinChange', ''): # No splice site mutations continue # Extract chromosome, start position, and end position - chr_ = record['chr'] # Convert specific chromosome values and exclude unwanted chromosomes - if chr_ == '23': + if chr_ == '23': chr_ = 'X' if chr_ == '24': chr_ = 'Y' if chr_ in ['MT', 'NA']: - continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values + continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values if not chr_.startswith('chr'): chr_ = 'chr' + chr_ - start_pos = record['startPosition'] - 1 # Convert to 0-based for BED + start_pos = record['startPosition'] - 1 # Convert to 0-based for BED end_pos = record['endPosition'] 
entrez_gene_id = record['entrezGeneId'] ncbi_build = record['ncbiBuild'] @@ -77,10 +86,10 @@ def process_json_to_bed(input_directory, output_bed_file): row = f"{chr_}\t{start_pos}\t{end_pos}\t{entrez_gene_id}\t{ncbi_build}\t{protein_change}\n" if row not in unique_rows: # Only add unique rows unique_rows.add(row) - buffer.append(row) # Append row to buffer + buffer.append(row) # Append row to buffer # Write buffer to file when it reaches the specified size - if len(buffer) >= buffer_size: + if len(buffer) >= buffer_size: bed_file.writelines(buffer) buffer.clear() @@ -110,9 +119,4 @@ def process_json_to_bed(input_directory, output_bed_file): # Run the function -config_obj = get_config() -dl_dir = Path(config_obj["relevant_paths"]["downloads"]) -out_dir = Path(config_obj["relevant_paths"]["generated_datasets"]) -input_directory = dl_dir / 'cbioportal' / '2024_10_21' / 'mutations' # Write a util to get latest dir -output_bed_file = out_dir / '2024_10_22' / 'liftover' / 'hg19entrez_build_protChange.bed' #Write a util to get latest dir process_json_to_bed(input_directory, output_bed_file) diff --git a/pipeline/convert_step2/liftover/2_liftover.sh b/pipeline/convert_step2/liftover/2_liftover.sh index f25715b..787e354 100755 --- a/pipeline/convert_step2/liftover/2_liftover.sh +++ b/pipeline/convert_step2/liftover/2_liftover.sh @@ -1,8 +1,11 @@ #!/bin/bash -# Load paths from config.json using jq -config_file="/path/to/config.json" # Replace with the actual path to your config.json -generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file) +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" + +generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $CONFIG_FILE) liftover_dir="${generated_datasets}/2024_10_22/liftover" # Extract rows with GRCh38 and save as tab-separated: diff --git a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh 
b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh index 4175254..90c9eea 100755 --- a/pipeline/download_step1/cbioportal/fetch_mutations_old.sh +++ b/pipeline/download_step1/cbioportal/fetch_mutations_old.sh @@ -1,7 +1,9 @@ #!/bin/bash -# Load paths from config.json -CONFIG_FILE="/path/to/config.json" +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE") # Today's date diff --git a/pipeline/download_step1/cbioportal/find_incomplete.py b/pipeline/download_step1/cbioportal/find_incomplete.py index e59b0bf..c509690 100644 --- a/pipeline/download_step1/cbioportal/find_incomplete.py +++ b/pipeline/download_step1/cbioportal/find_incomplete.py @@ -35,7 +35,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"): print(f"Found {len(incomplete_files)} incomplete files. Results saved to '{output_file}'.") # Load config.json -config_path = Path(__file__).resolve().parent.parent / "config.json" +config_path = Path(__file__).resolve().parent.parent.parent / "config.json" with open(config_path, "r") as config_file: config = json.load(config_file) diff --git a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh index 62f2039..5186ba4 100644 --- a/pipeline/download_step1/cbioportal/integrate_cancer_types.sh +++ b/pipeline/download_step1/cbioportal/integrate_cancer_types.sh @@ -1,11 +1,15 @@ #!/bin/bash -# Load the config file to dynamically retrieve paths -config_file="path/to/config.json" + +# Get this script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Load config.json +CONFIG_FILE="$SCRIPT_DIR/../../config.json" + # Get paths from config -input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file") -output_dir=$(jq -r 
'.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file") +input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$CONFIG_FILE") +output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$CONFIG_FILE") # Define the output files output_file="$output_dir/cancer_type_per_study.json"