Skip to content

Commit

Permalink
Merge pull request #3 from UW-GAC/feature/n-files
Browse files Browse the repository at this point in the history
Allow the user to specify the number of files instead of the manifest
  • Loading branch information
amstilp authored Sep 24, 2024
2 parents bb65ee6 + 615a0dd commit a5aa41a
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 11 deletions.
34 changes: 27 additions & 7 deletions fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,23 @@ def __init__(self, ngc_file, prefetch, output_dir="dbgap_fetch_output"):
self.prefetch = os.path.abspath(prefetch)
self.output_dir = os.path.abspath(output_dir)

def download_files(self, cart, manifest, n_retries=3, untar=False):
def download_files(self, cart, manifest=None, n_files=None, n_retries=3, untar=False):
"""Download files from dbGaP using a kart file. Because prefetch sometimes fails to download a file
but does not report an error, this method will retry downloading the cart a number of times.
Args:
cart (str): The path to the cart file.
manifest (str): The path to the manifest file.
n_retries (int): The number of times to retry downloading the cart.
Optional arguments:
manifest (str): The path to the manifest file. If this is provided, the method will check
that each file was successfully downloaded.
n_files (int): The number of files that should be downloaded. If this is provided, the method will
check that this many files were downloaded.
"""
# Work in a temporary directory to do the downloading.
cart_file = os.path.abspath(cart)
original_working_directory = os.getcwd()
manifest_files = self._read_manifest(manifest)
with tempfile.TemporaryDirectory() as temp_dir:
os.chdir(temp_dir)
# Download the files
Expand All @@ -48,7 +52,13 @@ def download_files(self, cart, manifest, n_retries=3, untar=False):
if i == n_retries:
print("Failed to download all files.")
return False
all_files_downloaded = self._check_prefetch(temp_dir, manifest_files)
if manifest:
all_files_downloaded = self._check_prefetch_against_manifest(temp_dir, manifest)
elif n_files:
all_files_downloaded = self._check_prefetch_against_n_files(temp_dir, n_files)
else:
# If no manifest or n_files was provided, we have to assume that it worked.
all_files_downloaded = True
i = i + 1
if untar:
self._untar(temp_dir)
Expand Down Expand Up @@ -86,11 +96,18 @@ def _run_prefetch(self, cart_file):
returned_value = subprocess.call(cmd, shell=True)
return returned_value

def _check_prefetch(self, directory, expected_files):
def _check_prefetch_against_manifest(self, directory, manifest):
    """Report whether the directory holds exactly the files named in the manifest.

    Args:
        directory (str): The directory whose entries are compared.
        manifest (str): The manifest passed to ``self._read_manifest`` to get
            the expected file names.

    Returns:
        bool: True when the set of entries in ``directory`` equals the set of
        names read from the manifest; False if anything is missing or extra.
    """
    # Read the manifest first, then list the directory (same order as before).
    wanted = set(self._read_manifest(manifest))
    present = set(os.listdir(directory))
    # Set comparison ignores ordering and duplicate names.
    return present == wanted

def _check_prefetch_against_n_files(self, directory, n_files):
"""Check that prefetch downloaded the expected number of files."""
downloaded_files = os.listdir(directory)
print(downloaded_files)
return len(downloaded_files) == n_files

def _untar(self, directory):
"""Untar all tar files in the directory."""
print("Untarring files.")
Expand All @@ -115,8 +132,11 @@ def _untar(self, directory):
# Required arguments.
parser.add_argument("--ngc", help="The path to the ngc file containing the project key.", required=True)
parser.add_argument("--cart", help="The cart file to use.", required=True)
parser.add_argument("--manifest", help="The manifest file to use.", required=True)
parser.add_argument("--outdir", help="The directory where files should be saved.", required=True)
# Files for downloading.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--manifest", help="The manifest file to use.", type=str)
group.add_argument("--n-files", help="The number of files expected to download.", type=int)
# Optional arguments.
parser.add_argument("--prefetch", help="The path to the prefetch executable.", default="prefetch")
parser.add_argument("--verbose", help="Print more information.", action="store_true")
Expand All @@ -128,7 +148,7 @@ def _untar(self, directory):
# Set up the class.
fetcher = dbGaPFileFetcher(args.ngc, args.prefetch, args.outdir)
# Download.
files_downloaded = fetcher.download_files(args.cart, args.manifest, untar=args.untar)
files_downloaded = fetcher.download_files(args.cart, manifest=args.manifest, n_files=args.n_files, untar=args.untar)
if not files_downloaded:
sys.exit(1)
else:
Expand Down
12 changes: 8 additions & 4 deletions fetch_dbgap_files.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ version 1.0
workflow fetch_dbgap_files {
input {
File cart_file
File manifest_file
File ngc_file
String output_directory
File? manifest_file
Int? n_files
Int? disk_gb
}

call fetch_files {
input:
cart_file=cart_file,
manifest_file=manifest_file,
n_files=n_files,
ngc_file=ngc_file,
output_directory=output_directory,
disk_gb=disk_gb
Expand All @@ -29,24 +31,26 @@ workflow fetch_dbgap_files {
task fetch_files {
input {
File cart_file
File manifest_file
File ngc_file
String output_directory
Int disk_gb = 50
File? manifest_file
Int? n_files
}
command {
python3 /usr/local/fetch-dbgap-files/fetch.py \
--prefetch /opt/sratoolkit.3.0.10-ubuntu64/bin/prefetch \
--ngc ~{ngc_file} \
--cart ~{cart_file} \
--manifest ~{manifest_file} \
~{"--manifest " + manifest_file} \
~{"--n-files " + n_files} \
--outdir tmp_download \
--untar
gsutil -m cp -r tmp_download/* ~{output_directory}
}
runtime {
# Pull from DockerHub
docker: "uwgac/fetch-dbgap-files:0.1.0"
docker: "uwgac/fetch-dbgap-files:0.2.0"
disks: "local-disk ${disk_gb} SSD"
}
}

0 comments on commit a5aa41a

Please sign in to comment.