Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get rid of hardcoded paths #23

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pipeline/convert_step2/cbioportal/2_ensp_to_uniprot_batch.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/bin/bash

# Load config.json and extract paths
config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json"
repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file")
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"
repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$CONFIG_FILE")

# Input and output file paths
input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
Expand Down
2 changes: 1 addition & 1 deletion pipeline/convert_step2/cbioportal/3_canonical_yes_no.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

Expand Down
4 changes: 2 additions & 2 deletions pipeline/convert_step2/cbioportal/4_compare_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from config.json
repos_base = Path(config["relevant_paths"]["repos"])
repos_base = Path(config["relevant_paths"]["repos_generated_datasets"])
ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json"

# Load your JSON file containing ENSEMBL to UniProt mappings
Expand Down
40 changes: 22 additions & 18 deletions pipeline/convert_step2/liftover/1_chr_pos_to_bed.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please revert to using utils and call get_config() to get to the config file? I got this method from Sean; it allows me to avoid writing "with open <...> json.load" in every script.

Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,27 @@
#GRCh37: 30925146
#NA: 3143
#GRCh38: 6253932

import os
import json
import glob
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent))
from utils import ROOT_DIR
from utils.config import get_config
# Load config.json directly and ensure paths are resolved correctly
CONFIG_FILE = Path(__file__).resolve().parent.parent.parent / "config.json"
if not CONFIG_FILE.exists():
raise FileNotFoundError(f"Config file not found at {CONFIG_FILE}")

with open(CONFIG_FILE, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from config
downloads_dir = Path(config["relevant_paths"]["downloads"])
generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"])

# Define input and output directories
input_directory = downloads_dir / "cbioportal" / "2024_10_21" / "mutations"
output_bed_file = generated_datasets_dir / "2024_10_22" / "liftover" / "hg19entrez_build_protChange.bed"

def process_json_to_bed(input_directory, output_bed_file):
buffer = []
Expand All @@ -49,26 +59,25 @@ def process_json_to_bed(input_directory, output_bed_file):
# Check genome build and SNP criteria
if record.get('ncbiBuild') == 'NA':
continue
if record.get('variantType') != 'SNP': #take SNPs only
if record.get('variantType') != 'SNP': # Take SNPs only
continue
if record['endPosition'] - record['startPosition'] != 0: #additional check to confirm SNP
if record['endPosition'] - record['startPosition'] != 0: # Additional check to confirm SNP
continue
if 'splice' in record.get('proteinChange', ''): #no splice site mutations
if 'splice' in record.get('proteinChange', ''): # No splice site mutations
continue

# Extract chromosome, start position, and end position

chr_ = record['chr']
# Convert specific chromosome values and exclude unwanted chromosomes
if chr_ == '23':
if chr_ == '23':
chr_ = 'X'
if chr_ == '24':
chr_ = 'Y'
if chr_ in ['MT', 'NA']:
continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values
continue # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values
if not chr_.startswith('chr'):
chr_ = 'chr' + chr_
start_pos = record['startPosition'] - 1 # Convert to 0-based for BED
start_pos = record['startPosition'] - 1 # Convert to 0-based for BED
end_pos = record['endPosition']
entrez_gene_id = record['entrezGeneId']
ncbi_build = record['ncbiBuild']
Expand All @@ -77,10 +86,10 @@ def process_json_to_bed(input_directory, output_bed_file):
row = f"{chr_}\t{start_pos}\t{end_pos}\t{entrez_gene_id}\t{ncbi_build}\t{protein_change}\n"
if row not in unique_rows: # Only add unique rows
unique_rows.add(row)
buffer.append(row) # Append row to buffer
buffer.append(row) # Append row to buffer

# Write buffer to file when it reaches the specified size
if len(buffer) >= buffer_size:
if len(buffer) >= buffer_size:
bed_file.writelines(buffer)
buffer.clear()

Expand Down Expand Up @@ -110,9 +119,4 @@ def process_json_to_bed(input_directory, output_bed_file):


# Run the function
config_obj = get_config()
dl_dir = Path(config_obj["relevant_paths"]["downloads"])
out_dir = Path(config_obj["relevant_paths"]["generated_datasets"])
input_directory = dl_dir / 'cbioportal' / '2024_10_21' / 'mutations' # Write a util to get latest dir
output_bed_file = out_dir / '2024_10_22' / 'liftover' / 'hg19entrez_build_protChange.bed' #Write a util to get latest dir
process_json_to_bed(input_directory, output_bed_file)
9 changes: 6 additions & 3 deletions pipeline/convert_step2/liftover/2_liftover.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/bin/bash

# Load paths from config.json using jq
config_file="/path/to/config.json" # Replace with the actual path to your config.json
generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file)
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"

generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $CONFIG_FILE)
liftover_dir="${generated_datasets}/2024_10_22/liftover"

# Extract rows with GRCh38 and save as tab-separated:
Expand Down
6 changes: 4 additions & 2 deletions pipeline/download_step1/cbioportal/fetch_mutations_old.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/bash

# Load paths from config.json
CONFIG_FILE="/path/to/config.json"
# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"
DOWNLOADS_DIR=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE")

# Today's date
Expand Down
2 changes: 1 addition & 1 deletion pipeline/download_step1/cbioportal/find_incomplete.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def find_incomplete_json_files(directory, output_file="incomplete_files.txt"):
print(f"Found {len(incomplete_files)} incomplete files. Results saved to '{output_file}'.")

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

Expand Down
12 changes: 8 additions & 4 deletions pipeline/download_step1/cbioportal/integrate_cancer_types.sh
mariacuria marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/bin/bash

# Load the config file to dynamically retrieve paths
config_file="path/to/config.json"

# Get this script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Load config.json
CONFIG_FILE="$SCRIPT_DIR/../../config.json"


# Get paths from config
input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$config_file")
output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$config_file")
input_dir=$(jq -r '.relevant_paths.downloads + "/cbioportal/2024_10_21/cancer_types"' "$CONFIG_FILE")
output_dir=$(jq -r '.relevant_paths.generated_datasets + "/2024_10_22"' "$CONFIG_FILE")

# Define the output files
output_file="$output_dir/cancer_type_per_study.json"
Expand Down