Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get rid of hardcoded paths #23

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/bin/bash

# Load config.json and extract paths
config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json"
repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file")
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"
repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$CONFIG_FILE")

# Input and output file paths
input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
Expand Down
2 changes: 1 addition & 1 deletion pipeline/convert_step2/cbioportal/3_canonical_yes_no.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

Expand Down
4 changes: 2 additions & 2 deletions pipeline/convert_step2/cbioportal/4_compare_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from config.json
repos_base = Path(config["relevant_paths"]["repos"])
repos_base = Path(config["relevant_paths"]["repos_generated_datasets"])
ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json"

# Load your JSON file containing ENSEMBL to UniProt mappings
Expand Down
40 changes: 22 additions & 18 deletions pipeline/convert_step2/liftover/1_chr_pos_to_bed.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please revert to using utils and call get_config() to get to the config file? I got this method from Sean; it allows me to avoid writing "with open <...> json.load" in every script.

Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,27 @@
#GRCh37: 30925146
#NA: 3143
#GRCh38: 6253932

import os
import json
import glob
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent))
from utils import ROOT_DIR
from utils.config import get_config
# Load config.json directly and ensure paths are resolved correctly
CONFIG_FILE = Path(__file__).resolve().parent.parent.parent / "config.json"
if not CONFIG_FILE.exists():
raise FileNotFoundError(f"Config file not found at {CONFIG_FILE}")

with open(CONFIG_FILE, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from config
downloads_dir = Path(config["relevant_paths"]["downloads"])
generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"])

# Define input and output directories
input_directory = downloads_dir / "cbioportal" / "2024_10_21" / "mutations"
output_bed_file = generated_datasets_dir / "2024_10_22" / "liftover" / "hg19entrez_build_protChange.bed"

def process_json_to_bed(input_directory, output_bed_file):
buffer = []
Expand All @@ -49,26 +59,25 @@ def process_json_to_bed(input_directory, output_bed_file):
# Check genome build and SNP criteria
if record.get('ncbiBuild') == 'NA':
continue
if record.get('variantType') != 'SNP': #take SNPs only
if record.get('variantType') != 'SNP': # Take SNPs only
continue
if record['endPosition'] - record['startPosition'] != 0: #additional check to confirm SNP
if record['endPosition'] - record['startPosition'] != 0: # Additional check to confirm SNP
continue
if 'splice' in record.get('proteinChange', ''): #no splice site mutations
if 'splice' in record.get('proteinChange', ''): # No splice site mutations
continue

# Extract chromosome, start position, and end position

chr_ = record['chr']
# Convert specific chromosome values and exclude unwanted chromosomes
if chr_ == '23':
if chr_ == '23':
chr_ = 'X'
if chr_ == '24':
chr_ = 'Y'
if chr_ in ['MT', 'NA']:
continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values
continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values
if not chr_.startswith('chr'):
chr_ = 'chr' + chr_
start_pos = record['startPosition'] - 1 # Convert to 0-based for BED
start_pos = record['startPosition'] - 1 # Convert to 0-based for BED
end_pos = record['endPosition']
entrez_gene_id = record['entrezGeneId']
ncbi_build = record['ncbiBuild']
Expand All @@ -77,10 +86,10 @@ def process_json_to_bed(input_directory, output_bed_file):
row = f"{chr_}\t{start_pos}\t{end_pos}\t{entrez_gene_id}\t{ncbi_build}\t{protein_change}\n"
if row not in unique_rows: # Only add unique rows
unique_rows.add(row)
buffer.append(row) # Append row to buffer
buffer.append(row) # Append row to buffer

# Write buffer to file when it reaches the specified size
if len(buffer) >= buffer_size:
if len(buffer) >= buffer_size:
bed_file.writelines(buffer)
buffer.clear()

Expand Down Expand Up @@ -110,9 +119,4 @@ def process_json_to_bed(input_directory, output_bed_file):


# Run the function
config_obj = get_config()
dl_dir = Path(config_obj["relevant_paths"]["downloads"])
out_dir = Path(config_obj["relevant_paths"]["generated_datasets"])
input_directory = dl_dir / 'cbioportal' / '2024_10_21' / 'mutations' # Write a util to get latest dir
output_bed_file = out_dir / '2024_10_22' / 'liftover' / 'hg19entrez_build_protChange.bed' #Write a util to get latest dir
process_json_to_bed(input_directory, output_bed_file)
9 changes: 6 additions & 3 deletions pipeline/convert_step2/liftover/2_liftover.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/bin/bash

# Load paths from config.json using jq
config_file="/path/to/config.json" # Replace with the actual path to your config.json
generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file)
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"

generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $CONFIG_FILE)
liftover_dir="${generated_datasets}/2024_10_22/liftover"

# Extract rows with GRCh38 and save as tab-separated:
Expand Down
6 changes: 4 additions & 2 deletions pipeline/download_step1/cbioportal/fetch_mutations_old.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/bash

# Load paths from config.json
CONFIG_FILE="/path/to/config.json"
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"
DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE")

# Today's date
Expand Down
2 changes: 1 addition & 1 deletion pipeline/download_step1/cbioportal/find_incomplete.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"):
print(f"Found {len(incomplete_files)} incomplete files. Results saved to '{output_file}'.")

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

Expand Down
12 changes: 8 additions & 4 deletions pipeline/download_step1/cbioportal/integrate_cancer_types.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/bin/bash

# Load the config file to dynamically retrieve paths
config_file="path/to/config.json"

# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"


# Get paths from config
input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file")
output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file")
input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$CONFIG_FILE")
output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$CONFIG_FILE")

# Define the output files
output_file="$output_dir/cancer_type_per_study.json"
Expand Down