Skip to content

Commit

Permalink
DSERV-81-footprint-assay-term-name (#78)
Browse files Browse the repository at this point in the history
  • Loading branch information
mingjiecn authored Feb 9, 2023
1 parent 9236bcb commit 4743ef2
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 20 deletions.
38 changes: 23 additions & 15 deletions genomic_data_service/region_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
RegionIndexerElasticSearch,
)
import requests
from requests.adapters import HTTPAdapter, Retry
import pickle
from genomic_data_service.constants import FILE_HG19
import argparse
Expand Down Expand Up @@ -123,6 +124,7 @@
'https://www.encodeproject.org/search/?control_type!=*&status=released&perturbed=false&assay_title=Histone+ChIP-seq&target.label=H3K27ac&target.label=H3K36me3&target.label=H3K4me3&target.label=H3K4me1&target.label=H3K27me3&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assembly=GRCh38&files.file_type=bed+narrowPeak&type=Experiment&files.analyses.status=released&files.preferred_default=true&limit=all&format=json'
+ '&field=files.accession&field=files.preferred_default&field=files.file_format&field=files.analyses.@id&field=default_analysis'
)

parser = argparse.ArgumentParser(
description='indexing files for genomic data service.'
)
Expand All @@ -149,6 +151,11 @@
choices=['RegulomeDB_2_0', 'RegulomeDB_2_1'],
)

session = requests.Session()
retries = Retry(total=5, backoff_factor=1,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))


def clean_up(obj, fields):
clean_obj = {}
Expand Down Expand Up @@ -185,7 +192,7 @@ def encode_graph(query):
query += ['field=*', 'limit=all', 'format=json']

endpoint = f"{ENCODE_DOMAIN}/search/?{'&'.join(query)}"
return requests.get(endpoint).json()['@graph']
return session.get(endpoint).json()['@graph']


def need_to_fetch_documents(dataset):
Expand Down Expand Up @@ -218,7 +225,7 @@ def fetch_documents(dataset):
documents = []
for document_id in dataset.get('documents', []):
endpoint = f'{ENCODE_DOMAIN}{document_id}?format=json'
documents.append(requests.get(endpoint).json())
documents.append(session.get(endpoint).json())

dataset['documents'] = documents

Expand Down Expand Up @@ -329,38 +336,39 @@ def make_pickle_file(encode_accessions):
def get_encode_accessions_from_portal():
encode_accessions = []
# get files in experiment TF ChIP-seq using assembly GRCh38
experiments = requests.get(
experiments = session.get(
TF_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph']
# get files in experiment DNase-seq using assembly GRCh38
experiments.extend(requests.get(
experiments.extend(session.get(
DNASE_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
# get files in experiment ATAC-seq using assembly GRCh38
experiments.extend(requests.get(
experiments.extend(session.get(
ATAC_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
# get files in experiment histone ChIP-seq using assembly GRCh38
experiments.extend(requests.get(
experiments.extend(session.get(
HISTONE_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
# get files in footprints
annotations = requests.get(
annotations = session.get(
FOOTPRINT_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']
# get files in PWMs
annotations.extend(requests.get(
annotations.extend(session.get(
PWM_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'])
# get files in eQTLs
annotations.extend(requests.get(
annotations.extend(session.get(
EQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'])
# get files for chromatin state for grch38
chromatin_state_files = requests.get(
chromatin_state_files = session.get(
CHROMATIN_STATE_FILES_GRCH38_ENDPOINT).json()['@graph']
# get ds_qtl annotations for grch38
ds_qtls = requests.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']
ds_qtls = session.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']

for experiment in experiments:
files = experiment.get('files', [])
default_analysis_id = experiment['default_analysis']
for file in files:
if is_preferred_default_bed_from_default_analysis(default_analysis_id, file):
encode_accessions.append(file['accession'])
default_analysis_id = experiment.get('default_analysis')
if default_analysis_id:
for file in files:
if is_preferred_default_bed_from_default_analysis(default_analysis_id, file):
encode_accessions.append(file['accession'])

for annotation in annotations:
files = annotation.get('files', [])
Expand Down
13 changes: 8 additions & 5 deletions genomic_data_service/region_indexer_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,16 +423,19 @@ def metadata_doc(file_uuid, file_metadata, dataset_metadata):
assay_title = dataset_metadata.get('assay_title')
if assay_title == 'Histone ChIP-seq':
meta_doc['dataset']['collection_type'] = assay_title
# footprints have both assay_term_name (a list) and annotation_type
elif dataset_metadata.get('annotation_type') == 'footprints':
meta_doc['dataset']['collection_type'] = 'footprints'
if dataset_metadata.get('assay_term_name') and 'ATAC-seq' in dataset_metadata.get('assay_term_name'):
meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq'
else:
meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq'
assay_term_name = dataset_metadata.get('assay_term_name')
if assay_term_name:
if 'ATAC-seq' in assay_term_name:
meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq'
elif 'DNase-seq' in assay_term_name:
meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq'
else:
# regulome uses three types of datasets: experiments, annotations and references. experiments have the property assay_term_name, annotations have the property annotation_type, references have the property reference_type.
# Those properties will be indexed as the dataset collection_type in the regulome database.
# Annotations can have both assay_term_name and annotation_type, for example, imputations and gkm-SVMs, but we don't use those datasets in regulome.
# Annotations can have both assay_term_name and annotation_type — for example, footprints, imputations and gkm-SVMs; of these, only footprints are used in regulome.
for prop in REGULOME_COLLECTION_TYPES:
prop_value = dataset_metadata.get(prop)
if prop_value:
Expand Down

0 comments on commit 4743ef2

Please sign in to comment.