Get rid of hardcoded paths #23

Open · wants to merge 8 commits into base: master
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
generated_datasets
+pipeline/convert_step2/liftover/annotations.pkl
+pipeline/convert_step2/liftover/Homo_sapiens.GRCh38.113.gff3
utils/__pycache__/
.*/
../downloads
*.log
4 changes: 3 additions & 1 deletion pipeline/config.json
Contributor:
"/data/shared/repos/biomuta-old/downloads" is a symlink to "/data/shared/biomuta/downloads", and "/data/shared/repos/biomuta-old/generated_datasets" is a symlink to "/data/shared/biomuta/generated/datasets".

@@ -1,10 +1,12 @@
{
    "relevant_paths": {
        "repos_generated_datasets": "/data/shared/repos/biomuta-old/generated_datasets",
        "repos_downloads": "/data/shared/repos/biomuta-old/downloads",
        "downloads": "/data/shared/biomuta/downloads",
        "generated_datasets": "/data/shared/biomuta/generated/datasets",
        "mapping": "/data/shared/biomuta/pipeline/convert_step2/mapping"
    },
    "resource_init": {
        "cbioportal": ["subfolder1", "subfolder2"]
    }
}
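For reference, the shell scripts in this PR read these paths with jq; a minimal sketch of the pattern:

# Pull one path out of pipeline/config.json (requires jq)
generated_datasets=$(jq -r '.relevant_paths.generated_datasets' pipeline/config.json)
echo "$generated_datasets"   # expected: /data/shared/biomuta/generated/datasets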
@@ -5,12 +5,12 @@
from pathlib import Path
from typing import Optional

# Configure logging
logging.basicConfig(filename="cancer_mapping.log",
                    filemode='a',
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

# Logging levels
# 1. error
# 2. warning
@@ -27,24 +27,27 @@
#from utils import ROOT_DIR #__init__.py, ROOT_DIR is a global var

# Define paths

# Get the directory of this script
script_dir = Path(__file__).resolve().parent
# Navigate to config.json location relative to script
config_dir = script_dir.parent.parent
-# Load config
-with open(config_dir/'config.json') as config_file:
+# Load config.json
+with open(config_dir / 'config.json') as config_file:
    config = json.load(config_file)

# Access paths from config

mapping_dir = Path(config["relevant_paths"]["mapping"])
doid_mapping = mapping_dir / "combined_do_mapping.json"
fallback_do_map = mapping_dir / "fallback_cbio_doid_mapping.json"

-# Input and output file names
-# Get the latest directory
-directory_path = Path(config["relevant_paths"]["generated_datasets"])
-latest_dir = max([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))], key=lambda d: os.path.getctime(os.path.join(directory_path, d)))
-latest_dir = Path(directory_path) / latest_dir
+# Get the latest directory in generated_datasets
+generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"])
+latest_dir = max([d for d in os.listdir(generated_datasets_dir) if os.path.isdir(os.path.join(generated_datasets_dir, d))], key=lambda d: os.path.getctime(os.path.join(generated_datasets_dir, d)))
+latest_dir = generated_datasets_dir / latest_dir

def ask_confirmation(prompt):
    while True:
        user_input = input(f"{prompt} (y/n): ").strip().lower()
@@ -54,11 +57,12 @@ def ask_confirmation(prompt):
            return False
        else:
            print(f"Invalid input. Please enter 'y' for yes or 'n' for no.")

if ask_confirmation(f"The latest created directory is: {latest_dir}. Proceed?"):
-    input_file = Path(latest_dir) / "unique_cancer_names.txt"
-    cancer_types_with_do = Path(latest_dir) / "cancer_types_with_do.json"
-    cancer_type_per_study = Path(latest_dir) / "cancer_type_per_study.json"
-    study_ids_with_do = Path(latest_dir) / "study_ids_with_do.json"
+    input_file = latest_dir / "unique_cancer_names.txt"
+    cancer_types_with_do = latest_dir / "cancer_types_with_do.json"
+    cancer_type_per_study = latest_dir / "cancer_type_per_study.json"
+    study_ids_with_do = latest_dir / "study_ids_with_do.json"
    print(f"Using {latest_dir}/unique_cancer_names.txt and writing out to {latest_dir}/cancer_types_with_do.json")
else:
    sys.exit("Aborted by user.")
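Both this script and 1_chr_pos_to_bed.py note that a "latest directory" helper would be useful. A minimal sketch of such a util, assuming pathlib and creation-time ordering as above (hypothetical helper, not part of this PR):

# utils/dirs.py -- hypothetical location for the helper
from pathlib import Path

def get_latest_dir(base: Path) -> Path:
    """Return the most recently created subdirectory of base."""
    subdirs = [d for d in base.iterdir() if d.is_dir()]
    if not subdirs:
        raise FileNotFoundError(f"No subdirectories under {base}")
    return max(subdirs, key=lambda d: d.stat().st_ctime)

With it, the max(os.listdir(...)) expression above collapses to latest_dir = get_latest_dir(generated_datasets_dir).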
@@ -1,8 +1,14 @@
#!/bin/bash

+# Get this script's directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Load config.json
+CONFIG_FILE="$SCRIPT_DIR/../../config.json"
+repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$CONFIG_FILE")

# Input and output file paths
-input_csv="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
-output_json="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file
+input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
+output_json="$repos_generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file

batch_size=5000 # Number of ENSP IDs per batch (adjustable)

@@ -1,12 +1,25 @@
import json
import pandas as pd
from pathlib import Path

+# Load config.json
+config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
+with open(config_path, "r") as config_file:
+    config = json.load(config_file)
+
+# Retrieve paths from updated config
+repos_generated_datasets = Path(config["relevant_paths"]["repos_generated_datasets"])
+repos_downloads = Path(config["relevant_paths"]["repos_downloads"])
+isoform_data_path = repos_downloads / "glygen/human_protein_masterlist.csv"
+ensp_to_uniprot_path = repos_generated_datasets / "2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json"
+canonical_toy_output_path = repos_generated_datasets / "2024_10_22/mapping_ids/canonical_toy.json"

# Load the ENSP to UniProt mapping JSON
-with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json", "r") as f:
+with open(ensp_to_uniprot_path, "r") as f:
    ensp_to_uniprot = json.load(f)

# Load the isoform data CSV
-isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv")
+isoform_data = pd.read_csv(isoform_data_path)

# Prepare a dictionary to store the results
result = {}
@@ -19,6 +32,9 @@ def strip_suffix(identifier):

# Iterate over each ENSP and its corresponding UniProt ID
for ensp, uniprot in ensp_to_uniprot.items():
+    # Default to "no" for canonical until proven otherwise
+    is_canonical = "no"

    # Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms
    for _, entry in isoform_data.iterrows():
        # Strip suffixes from isoform IDs before comparison
@@ -29,10 +45,15 @@ def strip_suffix(identifier):
        if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms:
            # If a match is found, add the uniprotkb_canonical_ac to the result
            uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac"))
-            # Store the first match found for each ENSP identifier
-            result[ensp] = uniprotkb_ac
+
+            # Check if the UniProt ID matches the canonical version
+            if uniprot == uniprotkb_ac:
+                is_canonical = "yes"
+
+            # Store the result with canonical status
+            result[ensp] = {"uniprotkb_canonical_ac": uniprotkb_ac, "canonical": is_canonical}
            break  # Exit inner loop once the first match is found

# Write the result to a JSON file
-with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical.json", "w") as json_file:
+with open(canonical_toy_output_path, "w") as json_file:
    json.dump(result, json_file, indent=4)
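For reference, a record in the rewritten canonical_toy.json now looks like this (the ENSP accession below is hypothetical; Q9NXB0 is the UniProt ID used in compare_fasta_test.py):

{
    "ENSP00000354587": {
        "uniprotkb_canonical_ac": "Q9NXB0",
        "canonical": "yes"
    }
}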
47 changes: 47 additions & 0 deletions pipeline/convert_step2/cbioportal/4_compare_fasta.py
@@ -0,0 +1,47 @@
import requests
import json
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent.parent / "config.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)

# Retrieve paths from config.json
repos_base = Path(config["relevant_paths"]["repos_generated_datasets"])
ensembl_uniprot_map_path = repos_base / "2024_10_22/mapping_ids/canonical_toy.json"

# Load your JSON file containing ENSEMBL to UniProt mappings
with open(ensembl_uniprot_map_path, "r") as file:
    ensembl_uniprot_map = json.load(file)

def fetch_ensembl_sequence(ensembl_id):  # Fetch ENSEMBL sequence
    url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain"
    response = requests.get(url)
    if response.status_code == 200:
        return response.text.strip()
    else:
        print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}")
        return None

def fetch_uniprot_sequence(uniprot_id):  # Fetch UniProt sequence
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.status_code == 200:
        return response.text.split('\n', 1)[1].replace('\n', '')
    else:
        print(f"Failed to fetch UniProt sequence for {uniprot_id}")
        return None

# Compare sequences
# Note: canonical_toy.json (written by the updated canonical-mapping script) stores a dict
# per ENSP, so pull the accession out before building the request URL
for ensembl_id, entry in ensembl_uniprot_map.items():
    uniprot_id = entry["uniprotkb_canonical_ac"]
    ensembl_sequence = fetch_ensembl_sequence(ensembl_id)
    uniprot_sequence = fetch_uniprot_sequence(uniprot_id)

    if ensembl_sequence and uniprot_sequence:
        if ensembl_sequence == uniprot_sequence:
            print(f"Sequences match for {ensembl_id} and {uniprot_id}")
        else:
            print(f"Sequences do not match for {ensembl_id} and {uniprot_id}")
    else:
        print(f"Could not compare sequences for {ensembl_id} and {uniprot_id}")
12 changes: 12 additions & 0 deletions pipeline/convert_step2/cbioportal/compare_fasta_test.py
@@ -0,0 +1,12 @@
import requests

def fetch_uniprot_sequence(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.status_code == 200:
        return response.text.split('\n', 1)[1].replace('\n', '')
    else:
        print(f"Failed to fetch UniProt sequence for {uniprot_id}")
        return None

print(fetch_uniprot_sequence('Q9NXB0'))
40 changes: 22 additions & 18 deletions pipeline/convert_step2/liftover/1_chr_pos_to_bed.py
Contributor:
Could you please revert to using utils and call get_config() to get to the config file? I got this method from Sean; it allows me to avoid writing the "with open <...> json.load" boilerplate in every script.
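For reference, a get_config() along those lines might look like the following; the actual utils.config implementation is not shown in this diff, so this is an assumption:

# utils/config.py -- hypothetical sketch of the helper referenced above
import json
from pathlib import Path

ROOT_DIR = Path(__file__).resolve().parent.parent  # assumed repo root

def get_config():
    """Load pipeline/config.json and return it as a dict."""
    with open(ROOT_DIR / "pipeline" / "config.json") as config_file:
        return json.load(config_file)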

@@ -16,17 +16,27 @@
#GRCh37: 30925146
#NA: 3143
#GRCh38: 6253932

import os
import json
import glob
import sys
from pathlib import Path

-sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent))
-from utils import ROOT_DIR
-from utils.config import get_config
+# Load config.json directly and ensure paths are resolved correctly
+CONFIG_FILE = Path(__file__).resolve().parent.parent.parent / "config.json"
+if not CONFIG_FILE.exists():
+    raise FileNotFoundError(f"Config file not found at {CONFIG_FILE}")
+
+with open(CONFIG_FILE, "r") as config_file:
+    config = json.load(config_file)
+
+# Retrieve paths from config
+downloads_dir = Path(config["relevant_paths"]["downloads"])
+generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"])
+
+# Define input and output directories
+input_directory = downloads_dir / "cbioportal" / "2024_10_21" / "mutations"
+output_bed_file = generated_datasets_dir / "2024_10_22" / "liftover" / "hg19entrez_build_protChange.bed"

def process_json_to_bed(input_directory, output_bed_file):
    buffer = []
@@ -49,26 +59,25 @@ def process_json_to_bed(input_directory, output_bed_file):
            # Check genome build and SNP criteria
            if record.get('ncbiBuild') == 'NA':
                continue
-            if record.get('variantType') != 'SNP': #take SNPs only
+            if record.get('variantType') != 'SNP': # Take SNPs only
                continue
-            if record['endPosition'] - record['startPosition'] != 0: #additional check to confirm SNP
+            if record['endPosition'] - record['startPosition'] != 0: # Additional check to confirm SNP
                continue
-            if 'splice' in record.get('proteinChange', ''): #no splice site mutations
+            if 'splice' in record.get('proteinChange', ''): # No splice site mutations
                continue

            # Extract chromosome, start position, and end position
            chr_ = record['chr']
            # Convert specific chromosome values and exclude unwanted chromosomes
            if chr_ == '23':
                chr_ = 'X'
            if chr_ == '24':
                chr_ = 'Y'
            if chr_ in ['MT', 'NA']:
                continue  # Skip records with 'MT' (mitochondrial) or 'NA' as chromosome values
            if not chr_.startswith('chr'):
                chr_ = 'chr' + chr_
            start_pos = record['startPosition'] - 1  # Convert to 0-based for BED
            end_pos = record['endPosition']
            entrez_gene_id = record['entrezGeneId']
            ncbi_build = record['ncbiBuild']
@@ -77,10 +86,10 @@
            row = f"{chr_}\t{start_pos}\t{end_pos}\t{entrez_gene_id}\t{ncbi_build}\t{protein_change}\n"
            if row not in unique_rows:  # Only add unique rows
                unique_rows.add(row)
                buffer.append(row)  # Append row to buffer

            # Write buffer to file when it reaches the specified size
            if len(buffer) >= buffer_size:
                bed_file.writelines(buffer)
                buffer.clear()

@@ -110,9 +119,4 @@ def process_json_to_bed(input_directory, output_bed_file):


# Run the function
-config_obj = get_config()
-dl_dir = Path(config_obj["relevant_paths"]["downloads"])
-out_dir = Path(config_obj["relevant_paths"]["generated_datasets"])
-input_directory = dl_dir / 'cbioportal' / '2024_10_21' / 'mutations' # Write a util to get latest dir
-output_bed_file = out_dir / '2024_10_22' / 'liftover' / 'hg19entrez_build_protChange.bed' #Write a util to get latest dir
process_json_to_bed(input_directory, output_bed_file)
24 changes: 15 additions & 9 deletions pipeline/convert_step2/liftover/2_liftover.sh
mariacuria marked this conversation as resolved.
@@ -1,25 +1,31 @@
#!/bin/bash

-wd="/data/shared/biomuta/generated/datasets/2024_10_22/liftover"
+# Get this script's directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Load config.json
+CONFIG_FILE="$SCRIPT_DIR/../../config.json"
+
+generated_datasets=$(jq -r '.relevant_paths.generated_datasets' "$CONFIG_FILE")
+liftover_dir="${generated_datasets}/2024_10_22/liftover"

# Extract rows with GRCh38 and save as tab-separated:
-awk '$5 == "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/cbio_hg38.bed
+awk '$5 == "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/cbio_hg38.bed
# Check if the extraction and modification were successful
if [ $? -ne 0 ]; then
    echo "Error extracting or modifying rows for GRCh38."
    exit 1
fi

# Extract all other rows (where the 5th column is not GRCh38) and save as tab-separated:
-awk '$5 != "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/hg19entrez_protChange.bed
+awk '$5 != "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/hg19entrez_protChange.bed
# Check if the extraction and modification were successful
if [ $? -ne 0 ]; then
    echo "Error extracting or modifying non-GRCh38 rows."
    exit 1
fi

# Run liftOver to convert coordinates (first chain)
-./liftOver ${wd}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ucsc_unmapped_entrez_protChange.bed
+./liftOver ${liftover_dir}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed

# Check if the first liftOver was successful
if [ $? -ne 0 ]; then
@@ -28,7 +34,7 @@ if [ $? -ne 0 ]; then
fi

# Run liftOver to convert coordinates (second chain)
-./liftOver ${wd}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${wd}/ensembl_hg38entrez_protChange.bed ${wd}/ensembl_unmapped_entrez_protChange.bed
+./liftOver ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${liftover_dir}/ensembl_hg38entrez_protChange.bed ${liftover_dir}/ensembl_unmapped_entrez_protChange.bed

# Check if the second liftOver was successful
if [ $? -ne 0 ]; then
@@ -37,14 +43,14 @@ if [ $? -ne 0 ]; then
fi

# Prepend 'chr' to the 1st column of ensembl_hg38entrez_protChange.bed
-sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/temp && mv ${wd}/temp ${wd}/ensembl_hg38entrez_protChange.bed
+sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/ensembl_hg38entrez_protChange.bed

# Combine all hg38 files
-cat ${wd}/cbio_hg38.bed ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/hg38_combined.bed
+cat ${liftover_dir}/cbio_hg38.bed ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/hg38_combined.bed
# Remove duplicate rows taking into account extra tabs
-awk -v OFS='\t' '{$1=$1; print}' ${wd}/hg38_combined.bed | sort -u > ${wd}/temp && mv ${wd}/temp ${wd}/hg38_combined.bed
+awk -v OFS='\t' '{$1=$1; print}' ${liftover_dir}/hg38_combined.bed | sort -u > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/hg38_combined.bed
# Add headers with tab separation
-sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${wd}/hg38_combined.bed
+sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${liftover_dir}/hg38_combined.bed

echo "Script completed successfully."
