From c48008c8f4964016c56d8272725238a61f6aaf55 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 25 Jun 2019 13:21:24 -0700 Subject: [PATCH 1/4] 1) added short param for --file-db (-d) and --no-file-db (-n), 2) fix bug for --use-netrc not working with submit command, 3) added minimum required parameter section to default conf file (so that users only need to edit params in that section) --- caper/caper_args.py | 105 ++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/caper/caper_args.py b/caper/caper_args.py index 16899773..bb1ead0b 100644 --- a/caper/caper_args.py +++ b/caper/caper_args.py @@ -28,11 +28,53 @@ DEFAULT_DEEPCOPY_EXT = 'json,tsv' DEFAULT_CAPER_CONF_CONTENTS = """[defaults] -############# Caper settings +############ Minimum required parameters +## Please read through carefully + +## Define file DB to use Cromwell's call-caching +## Call-caching is important for restarting failed workflows +## File DB can only be accessed by one caper process (caper run or server) +## i.e. you cannot run multiple caper run with one file DB +## For such case, we recommend to use caper server and submit multiple workflows to it +## You can disable file DB with '--no-file-db' or '-n' +#file-db=~/.caper/default_file_db -## default backend +## Define to use 'caper server' and all client subcommands like 'caper submit' +## This is not required for 'caper run' +#port=8000 + +## Define default backend (local, gcp, aws, slurm, sge, pbs) #backend=local +## Define output directory if you want to run pipelines locally +#out-dir= + +## Define if you want to run pipelines on Google Cloud Platform +#gcp-prj=encode-dcc-1016 +#out-gcs-bucket=gs://encode-pipeline-test-runs/caper_out_project1 + +## Define if you want to run pipelines on AWS +#aws-batch-arn=arn:.... +#aws-region=us-west-1 +#out-s3-bucket=s3://encode-pipeline-test-runs/caper_out_project1 + +## Define if you want to run pipelines on SLURM +## Define partition or account or both according to your cluster's requirements +## For example, Stanford requires a partition and SCG requires an account. +#slurm-partition=akundaje +#slurm-account=akundaje + +## Define if you want to run pipelines on SGE +#sge-pe=shm + +## Define if your SGE cluster requires a queue +#sge-queue=q + +## Define if your PBS cluster requires a queue +#pbs-queue=q + + +############# Caper settings ## cromwell.jar java heap ## java -Xmx for "caper server" #java-heap-server=5G @@ -40,15 +82,7 @@ ## java -Xmx for "caper run" #java-heap-run=1G -## Put a hold on submitted jobs. -## You need to run "caper unhold [WORKFLOW_ID]" to release hold -#hold=True - ### Workflow settings -## deepcopy recursively all file URIs in a file URI -## with supported extensions (json,tsv,csv) -## to a target remote/local storage -#deepcopy=True #deepcopy-ext=json,tsv ############# local backend @@ -64,26 +98,17 @@ ## trying to write on the same image file. #no-build-singularity=True -## actual workflow outputs will be written to -## out-dir/[WDL_NAME]/[WORKFLOW_ID]/ -#out-dir= - ## all temporary files (including deepcopied data files, -## cromwell.jar, backend conf, worflow_opts JSONs, ...) +## backend conf, worflow_opts JSONs, ...) ## will be written to this directory ## DON'T USE /tmp. 
User's scratch directory is recommended #tmp-dir= ############# Google Cloud Platform backend -#gcp-prj=encode-dcc-1016 -#out-gcs-bucket=gs://encode-pipeline-test-runs/caper/out -#tmp-gcs-bucket=gs://encode-pipeline-test-runs/caper/tmp +#tmp-gcs-bucket= ############# AWS backend -#aws-batch-arn= -#aws-region=us-west-1 -#out-s3-bucket=s3://encode-pipeline-test-runs/caper/out -#tmp-s3-bucket=s3://encode-pipeline-test-runs/caper/tmp +#tmp-s3-bucket= ## gsutil can work with s3 buckets it outperforms over aws s3 CLI #use-gsutil-over-aws-s3=True @@ -102,13 +127,9 @@ ## machine www.encodeproject.org ## login ZSFDUCEJ ## password YOUR_PASSWORD - #use-netrc=True ############# Cromwell's built-in HyperSQL database settings -## DB file prefix path -#file-db=~/.caper/default_file_db - ## disable file-db ## Detach DB from Cromwell ## you can run multiple workflows with 'caper run' command @@ -124,7 +145,7 @@ #mysql-db-password=cromwell ############# Cromwell general settings -#cromwell=https://github.com/broadinstitute/cromwell/releases/download/40/cromwell-40.jar +#cromwell=https://github.com/broadinstitute/cromwell/releases/download/42/cromwell-42.jar #max-concurrent-tasks=1000 #max-concurrent-workflows=40 #disable-call-caching=False @@ -132,20 +153,14 @@ ## Cromwell server #ip=localhost -#port=8000 ############# SLURM backend -#slurm-partition=akundaje -#slurm-account=akundaje #slurm-extra-param= ############# SGE backend -#sge-queue=q -#sge-pe=shm #sge-extra-param= ############# PBS backend -#pbs-queue=q #pbs-extra-param= ############# Misc. settings @@ -208,10 +223,10 @@ def parse_caper_arguments(): group_file_db = parent_host.add_argument_group( title='HyperSQL file DB arguments') group_file_db.add_argument( - '--file-db', default=DEFAULT_FILE_DB, + '--file-db', '-d', default=DEFAULT_FILE_DB, help='Default DB file for Cromwell\'s built-in HyperSQL database.') group_file_db.add_argument( - '--no-file-db', action='store_true', + '--no-file-db', '-n', action='store_true', help='Disable file DB for Cromwell\'s built-in HyperSQL database. 
' 'An in-memory DB will still be available for server mode.') @@ -284,7 +299,8 @@ def parse_caper_arguments(): '--use-gsutil-over-aws-s3', action='store_true', help='Use gsutil instead of aws s3 CLI even for S3 buckets.') - group_http = parent_host.add_argument_group( + parent_http_auth = argparse.ArgumentParser(add_help=False) + group_http = parent_http_auth.add_argument_group( title='HTTP/HTTPS authentication arguments') group_http.add_argument( '--http-user', @@ -464,14 +480,16 @@ def parse_caper_arguments(): p_run = subparser.add_parser( 'run', help='Run a single workflow without server', - parents=[parent_submit, parent_run, parent_host, parent_backend]) + parents=[parent_submit, parent_run, parent_host, parent_backend, + parent_http_auth]) p_server = subparser.add_parser( 'server', help='Run a Cromwell server', - parents=[parent_server_client, parent_server, parent_host, parent_backend]) + parents=[parent_server_client, parent_server, parent_host, + parent_backend, parent_http_auth]) p_submit = subparser.add_parser( 'submit', help='Submit a workflow to a Cromwell server', parents=[parent_server_client, parent_submit, - parent_backend]) + parent_backend, parent_http_auth]) p_abort = subparser.add_parser( 'abort', help='Abort running/pending workflows on a Cromwell server', parents=[parent_server_client, parent_search_wf]) @@ -490,7 +508,8 @@ def parse_caper_arguments(): 'troubleshoot', help='Troubleshoot workflow problems from metadata JSON file or ' 'workflow IDs', - parents=[parent_troubleshoot, parent_server_client, parent_search_wf]) + parents=[parent_troubleshoot, parent_server_client, parent_search_wf, + parent_http_auth]) for p in [p_run, p_server, p_submit, p_abort, p_unhold, p_list, p_metadata, p_troubleshoot]: @@ -573,16 +592,16 @@ def parse_caper_arguments(): args_d['out_dir'] = os.getcwd() if args_d.get('tmp_dir') is None: - args_d['tmp_dir'] = os.path.join(args_d['out_dir'], 'caper_tmp') + args_d['tmp_dir'] = os.path.join(args_d['out_dir'], '.caper_tmp') if args_d.get('tmp_s3_bucket') is None: if args_d.get('out_s3_bucket'): args_d['tmp_s3_bucket'] = os.path.join(args_d['out_s3_bucket'], - 'caper_tmp') + '.caper_tmp') if args_d.get('tmp_gcs_bucket') is None: if args_d.get('out_gcs_bucket'): args_d['tmp_gcs_bucket'] = os.path.join(args_d['out_gcs_bucket'], - 'caper_tmp') + '.caper_tmp') file_db = args_d.get('file_db') if file_db is not None: file_db = os.path.abspath(os.path.expanduser(file_db)) From 4835622e2ac64f8df4466d0f22e0b037cca61b75 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 25 Jun 2019 13:22:33 -0700 Subject: [PATCH 2/4] 1) put staticmethods to the end of Caper class, 2) download cromwell on ~/.caper/cromwell_jar/ not on --- caper/caper.py | 297 ++++++++++++++++++++++++++----------------------- 1 file changed, 155 insertions(+), 142 deletions(-) diff --git a/caper/caper.py b/caper/caper.py index f9d0238c..da21e8c4 100644 --- a/caper/caper.py +++ b/caper/caper.py @@ -60,6 +60,7 @@ class Caper(object): """Cromwell/WDL wrapper """ + CROMWELL_JAR_DIR = '~/.caper/cromwell_jar' BACKEND_CONF_HEADER = 'include required(classpath("application"))\n' DEFAULT_BACKEND = BACKEND_LOCAL RE_PATTERN_BACKEND_CONF_HEADER = r'^\s*include\s' @@ -221,7 +222,7 @@ def run(self): # LOG_LEVEL must be >=INFO to catch workflow ID from STDOUT cmd = ['java', java_heap, '-XX:ParallelGCThreads=1', '-DLOG_LEVEL=INFO', '-jar', '-Dconfig.file={}'.format(backend_file), - CaperURI(self._cromwell).get_local_file(), 'run', + self.__download_cromwell_jar(), 'run', 
CaperURI(self._wdl).get_local_file(), '-i', input_file, '-o', workflow_opts_file, @@ -289,7 +290,7 @@ def server(self): # LOG_LEVEL must be >=INFO to catch workflow ID from STDOUT cmd = ['java', java_heap, '-XX:ParallelGCThreads=1', '-DLOG_LEVEL=INFO', '-jar', '-Dconfig.file={}'.format(backend_file), - CaperURI(self._cromwell).get_local_file(), 'server'] + self.__download_cromwell_jar(), 'server'] print('[Caper] cmd: ', cmd) # pending/running workflows @@ -475,6 +476,18 @@ def troubleshoot(self): for metadata in metadatas: Caper.__troubleshoot(metadata, self._show_completed_task) + def __download_cromwell_jar(self): + """Download cromwell-X.jar + """ + cu = CaperURI(self._cromwell) + if cu.uri_type == URI_LOCAL: + return cu.get_uri() + + path = os.path.join( + os.path.expanduser(Caper.CROMWELL_JAR_DIR), + os.path.basename(self._cromwell)) + return cu.copy(target_uri=path) + def __write_metadata_jsons(self, workflow_ids): try: for wf_id in workflow_ids: @@ -501,146 +514,6 @@ def __write_metadata_jsons(self, workflow_ids): str(e), workflow_ids) return False - @staticmethod - def __troubleshoot(metadata_json, show_completed_task=False): - """Troubleshoot from metadata JSON obj/file - """ - if isinstance(metadata_json, dict): - metadata = metadata_json - else: - f = CaperURI(metadata_json).get_local_file() - with open(f, 'r') as fp: - metadata = json.loads(fp.read()) - if isinstance(metadata, list): - metadata = metadata[0] - - workflow_id = metadata['id'] - workflow_status = metadata['status'] - print('[Caper] troubleshooting {} ...'.format(workflow_id)) - if not show_completed_task and workflow_status == 'Succeeded': - print('This workflow ran successfully. ' - 'There is nothing to troubleshoot') - return - - def recurse_calls(calls, failures=None, show_completed_task=False): - if failures is not None: - s = json.dumps(failures, indent=4) - print('Found failures:\n{}'.format(s)) - for task_name, call_ in calls.items(): - for call in call_: - # if it is a subworkflow, then recursively dive into it - if 'subWorkflowMetadata' in call: - subworkflow = call['subWorkflowMetadata'] - recurse_calls( - subworkflow['calls'], - subworkflow['failures'] - if 'failures' in subworkflow else None, - show_completed_task) - continue - task_status = call['executionStatus'] - shard_index = call['shardIndex'] - rc = call['returnCode'] if 'returnCode' in call else None - job_id = call['jobId'] if 'jobId' in call else None - stdout = call['stdout'] if 'stdout' in call else None - stderr = call['stderr'] if 'stderr' in call else None - if 'executionEvents' in call: - for ev in call['executionEvents']: - if ev['description'].startswith('Running'): - run_start = ev['startTime'] - run_end = ev['endTime'] - break - else: - run_start = None - run_end = None - - if not show_completed_task and \ - task_status in ('Done', 'Succeeded'): - continue - print('\n{} {}. 
SHARD_IDX={}, RC={}, JOB_ID={}, ' - 'RUN_START={}, RUN_END={}, ' - 'STDOUT={}, STDERR={}'.format( - task_name, task_status, shard_index, rc, job_id, - run_start, run_end, stdout, stderr)) - - if stderr is not None: - cu = CaperURI(stderr) - if cu.file_exists(): - local_stderr_f = cu.get_local_file() - with open(local_stderr_f, 'r') as fp: - stderr_contents = fp.read() - print('STDERR_CONTENTS=\n{}'.format( - stderr_contents)) - - calls = metadata['calls'] - failures = metadata['failures'] if 'failures' in metadata else None - recurse_calls(calls, failures, show_completed_task) - - @staticmethod - def __find_singularity_bindpath(input_json_file): - """Read input JSON file and find paths to be bound for singularity - by finding common roots for all files in input JSON file - """ - with open(input_json_file, 'r') as fp: - input_json = json.loads(fp.read()) - - # find dirname of all files - def recurse_dict(d, d_parent=None, d_parent_key=None, - lst=None, lst_idx=None): - result = set() - if isinstance(d, dict): - for k, v in d.items(): - result |= recurse_dict(v, d_parent=d, - d_parent_key=k) - elif isinstance(d, list): - for i, v in enumerate(d): - result |= recurse_dict(v, lst=d, - lst_idx=i) - elif type(d) == str: - assert(d_parent is not None or lst is not None) - c = CaperURI(d) - # local absolute path only - if c.uri_type == URI_LOCAL and c.is_valid_uri(): - dirname, basename = os.path.split(c.get_uri()) - result.add(dirname) - - return result - - all_dirnames = recurse_dict(input_json) - # add all (but not too high level<4) parent directories - # to all_dirnames. start from self - # e.g. /a/b/c/d/e/f/g/h with COMMON_ROOT_SEARCH_LEVEL = 5 - # add all the followings: - # /a/b/c/d/e/f/g/h (self) - # /a/b/c/d/e/f/g - # /a/b/c/d/e/f - # /a/b/c/d/e - # /a/b/c/d (minimum level = COMMON_ROOT_SEARCH_LEVEL-1) - all_dnames_incl_parents = set(all_dirnames) - for d in all_dirnames: - dir_arr = d.split(os.sep) - for i, _ in enumerate( - dir_arr[Caper.COMMON_ROOT_SEARCH_LEVEL:]): - d_child = os.sep.join( - dir_arr[:i + Caper.COMMON_ROOT_SEARCH_LEVEL]) - all_dnames_incl_parents.add(d_child) - - bindpaths = set() - # remove overlapping directories - for i, d1 in enumerate(sorted(all_dnames_incl_parents, - reverse=True)): - overlap_found = False - for j, d2 in enumerate(sorted(all_dnames_incl_parents, - reverse=True)): - if i >= j: - continue - if d1.startswith(d2): - overlap_found = True - break - if not overlap_found: - bindpaths.add(d1) - - return ','.join(bindpaths) - def __write_metadata_json(self, workflow_id, metadata_json, backend=None, wdl=None): if backend is None: @@ -1092,6 +965,146 @@ def __get_workflow_ids_from_cromwell_stdout(stdout): def __get_time_str(): return datetime.now().strftime('%Y%m%d_%H%M%S_%f') + @staticmethod + def __troubleshoot(metadata_json, show_completed_task=False): + """Troubleshoot from metadata JSON obj/file + """ + if isinstance(metadata_json, dict): + metadata = metadata_json + else: + f = CaperURI(metadata_json).get_local_file() + with open(f, 'r') as fp: + metadata = json.loads(fp.read()) + if isinstance(metadata, list): + metadata = metadata[0] + + workflow_id = metadata['id'] + workflow_status = metadata['status'] + print('[Caper] troubleshooting {} ...'.format(workflow_id)) + if not show_completed_task and workflow_status == 'Succeeded': + print('This workflow ran successfully. 
' + 'There is nothing to troubleshoot') + return + + def recurse_calls(calls, failures=None, show_completed_task=False): + if failures is not None: + s = json.dumps(failures, indent=4) + print('Found failures:\n{}'.format(s)) + for task_name, call_ in calls.items(): + for call in call_: + # if it is a subworkflow, then recursively dive into it + if 'subWorkflowMetadata' in call: + subworkflow = call['subWorkflowMetadata'] + recurse_calls( + subworkflow['calls'], + subworkflow['failures'] + if 'failures' in subworkflow else None, + show_completed_task) + continue + task_status = call['executionStatus'] + shard_index = call['shardIndex'] + rc = call['returnCode'] if 'returnCode' in call else None + job_id = call['jobId'] if 'jobId' in call else None + stdout = call['stdout'] if 'stdout' in call else None + stderr = call['stderr'] if 'stderr' in call else None + if 'executionEvents' in call: + for ev in call['executionEvents']: + if ev['description'].startswith('Running'): + run_start = ev['startTime'] + run_end = ev['endTime'] + break + else: + run_start = None + run_end = None + + if not show_completed_task and \ + task_status in ('Done', 'Succeeded'): + continue + print('\n{} {}. SHARD_IDX={}, RC={}, JOB_ID={}, ' + 'RUN_START={}, RUN_END={}, ' + 'STDOUT={}, STDERR={}'.format( + task_name, task_status, shard_index, rc, job_id, + run_start, run_end, stdout, stderr)) + + if stderr is not None: + cu = CaperURI(stderr) + if cu.file_exists(): + local_stderr_f = cu.get_local_file() + with open(local_stderr_f, 'r') as fp: + stderr_contents = fp.read() + print('STDERR_CONTENTS=\n{}'.format( + stderr_contents)) + + calls = metadata['calls'] + failures = metadata['failures'] if 'failures' in metadata else None + recurse_calls(calls, failures, show_completed_task) + + @staticmethod + def __find_singularity_bindpath(input_json_file): + """Read input JSON file and find paths to be bound for singularity + by finding common roots for all files in input JSON file + """ + with open(input_json_file, 'r') as fp: + input_json = json.loads(fp.read()) + + # find dirname of all files + def recurse_dict(d, d_parent=None, d_parent_key=None, + lst=None, lst_idx=None): + result = set() + if isinstance(d, dict): + for k, v in d.items(): + result |= recurse_dict(v, d_parent=d, + d_parent_key=k) + elif isinstance(d, list): + for i, v in enumerate(d): + result |= recurse_dict(v, lst=d, + lst_idx=i) + elif type(d) == str: + assert(d_parent is not None or lst is not None) + c = CaperURI(d) + # local absolute path only + if c.uri_type == URI_LOCAL and c.is_valid_uri(): + dirname, basename = os.path.split(c.get_uri()) + result.add(dirname) + + return result + + all_dirnames = recurse_dict(input_json) + # add all (but not too high level<4) parent directories + # to all_dirnames. start from self + # e.g. 
/a/b/c/d/e/f/g/h with COMMON_ROOT_SEARCH_LEVEL = 5 + # add all the followings: + # /a/b/c/d/e/f/g/h (self) + # /a/b/c/d/e/f/g + # /a/b/c/d/e/f + # /a/b/c/d/e + # /a/b/c/d (minimum level = COMMON_ROOT_SEARCH_LEVEL-1) + all_dnames_incl_parents = set(all_dirnames) + for d in all_dirnames: + dir_arr = d.split(os.sep) + for i, _ in enumerate( + dir_arr[Caper.COMMON_ROOT_SEARCH_LEVEL:]): + d_child = os.sep.join( + dir_arr[:i + Caper.COMMON_ROOT_SEARCH_LEVEL]) + all_dnames_incl_parents.add(d_child) + + bindpaths = set() + # remove overlapping directories + for i, d1 in enumerate(sorted(all_dnames_incl_parents, + reverse=True)): + overlap_found = False + for j, d2 in enumerate(sorted(all_dnames_incl_parents, + reverse=True)): + if i >= j: + continue + if d1.startswith(d2): + overlap_found = True + break + if not overlap_found: + bindpaths.add(d1) + + return ','.join(bindpaths) + def main(): # parse arguments From bfbc9cdad8f984820f59ac4fac1db4e2653a89c1 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 25 Jun 2019 13:22:43 -0700 Subject: [PATCH 3/4] update README --- DETAILS.md | 73 ++++++++++++++++- README.md | 235 ++++++++++++++++++++++++----------------------------- 2 files changed, 177 insertions(+), 131 deletions(-) diff --git a/DETAILS.md b/DETAILS.md index 48a09fe6..1de476c3 100644 --- a/DETAILS.md +++ b/DETAILS.md @@ -1,3 +1,29 @@ +## Important features of Caper + +* **Similar CLI**: Caper has a similar CLI as Cromwell. + +* **Built-in backends**: You don't need your own backend configuration file. Caper provides built-in backends. + +* **Automatic transfer between local/cloud storages**: You can use URIs (e.g. `gs://`, `http(s)://` and `s3://`) instead of paths in a command line arguments, also in your input JSON file. Files associated with these URIs will be automatically transfered to a specified temporary directory on a target remote storage. + +* **Deepcopy for input JSON file**: Recursively copy all data files in (`.json`, `.tsv` and `.csv`) to a target remote storage. Use `--deepcopy` for this feature. + +* **Docker/Singularity integration**: You can run a WDL workflow in a specifed docker/singularity container. + +* **MySQL database integration**: Caper defaults to use Cromwell's built-in HyperSQL DB to store metadata of all workflows. However, we also provide shell scripts to run a MySQL database server in a docker/singularity container. Using Caper with those databases will allow you to use Cromwell's [call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous successful tasks. This will be useful to resume a failed workflow where it left off. + +* **One configuration file for all**: You may not want to repeat writing same command line parameters for every pipeline run. Define parameters in a configuration file at `~/.caper/default.conf`. + +* **One server for six backends**: Built-in backends allow you to submit pipelines to any local/remote backend specified with `-b` or `--backend`. + +* **Cluster engine support**: SLURM, SGE and PBS are currently supported locally. + +* **Easy workflow management**: Find all workflows submitted to a Cromwell server by workflow IDs (UUIDs) or `str_label` (special label for a workflow submitted by Caper `submit` and `run`). You can define multiple keywords with wildcards (`*` and `?`) to search for matching workflows. Abort, release hold, retrieve metadata JSON for them. 
+ +* **Automatic subworkflow packing**: Caper automatically creates an archive (`imports.zip`) of all imports and send it to Cromwell server/run. + +* **Special label** (`str_label`): You have a string label, specified with `-s` or `--str-label`, for your workflow so that you can search for your workflow by this label instead of Cromwell's workflow UUID (e.g. `f12526cb-7ed8-4bfa-8e2e-a463e94a61d0`). + ## List of parameters We highly recommend to use a default configuration file described in the section [Configuration file](#configuration-file). Note that both dash (`-`) and underscore (`_`) are allowed for key names in a configuration file. @@ -23,8 +49,8 @@ We highly recommend to use a default configuration file described in the section --use-singularity|Use singularity image for all tasks in a workflow --no-build-singularity|Local singularity image will not be built before running/submitting a workflow --singularity-cachedir|Singularity image URI for a WDL - --file-db|DB file for Cromwell's built-in HyperSQL database - --no-file-db|Do not use file-db. Call-caching (re-using outputs) will be disabled + --file-db, -d|DB file for Cromwell's built-in HyperSQL database + --no-file-db, -n|Do not use file-db. Call-caching (re-using outputs) will be disabled * Choose a default backend. Use `--deepcopy` to recursively auto-copy data files in your input JSON file. All data files will be automatically transferred to a target local/remote storage corresponding to a chosen backend. Make sure that you correctly configure temporary directories for source/target storages (`--tmp-dir`, `--tmp-gcs-bucket` and `--tmp-s3-bucket`). @@ -138,6 +164,49 @@ There are six built-in backends for Caper. Each backend must run on its designat |sge |local SGE backend | local | --out-dir, --tmp-dir, --sge-pe | |pds |local PBS backend | local | --out-dir, --tmp-dir | +## Database + +Caper defaults to use Cromwell's built-in HyperSQL file database located at `~/.caper/default_file_db`. You can change default database file path prefix in a default configuration file (`~/.caper/default.conf`). Setting up a database is important for Caper to re-use outputs from previous failed/succeeded workflows. +``` +file-db=[YOUR_FILE_DB_PATH_PREFIX] +``` + +You can also use your own MySQL database if you [configure MySQL for Caper](DETAILS.md/#mysql-server). + +## Singularity + +Caper supports Singularity for its local built-in backend (`local`, `slurm`, `sge` and `pbs`). Tasks in a workflow will run inside a container and outputs will be pulled out to a host from it at the end of each task. Or you can add `--use-singularity` to use a [Singularity image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization). + +```bash +$ caper run [WDL] -i [INPUT_JSON] --singularity [SINGULARITY_IMAGE_URI] +``` + +Define a cache directory where local Singularity images will be built. You can also define an environment variable `SINGULARITY_CACHEDIR`. +``` +singularity-cachedir=[SINGULARITY_CACHEDIR] +``` + +Singularity image will be built first before running a workflow to prevent mutiple tasks from competing to write on the same local image file. If you don't define it, every task in a workflow will try to repeatedly build a local Singularity image on their temporary directory. + + +## Docker + +Caper supports Docker for its non-HPC backends (`local`, `aws` and `gcp`). + +> **WARNING**: AWS and GCP backends will not work without a Docker image URI defined in a WDL file or specified with `--docker`. 
You can skip adding `--use-docker` since Caper will try to find it in your WDL first. + +Tasks in a workflow will run inside a container and outputs will be pulled out to a host from it at the end of each task. Or you can add `--use-docker` to use a [Docker image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization). + +```bash +$ caper run [WDL] -i [INPUT_JSON] --docker [DOCKER_IMAGE_URI] +``` + +## Conda + +Activate your `CONDA_ENV` before running Caper (both for `run` and `server` modes). +```bash +$ conda activate [COND_ENV] +``` ## MySQL server diff --git a/README.md b/README.md index 8bb5e278..72b29882 100644 --- a/README.md +++ b/README.md @@ -6,32 +6,6 @@ Caper (Cromwell Assisted Pipeline ExecutoR) is a wrapper Python package for [Cro Caper is based on Unix and cloud platform CLIs (`curl`, `gsutil` and `aws`) and provides easier way of running Cromwell server/run modes by automatically composing necessary input files for Cromwell. Also, Caper supports easy automatic file transfer between local/cloud storages (local path, `s3://`, `gs://` and `http(s)://`). You can use these URIs in input JSON file or for a WDL file itself. -## Features - -* **Similar CLI**: Caper has a similar CLI as Cromwell. - -* **Built-in backends**: You don't need your own backend configuration file. Caper provides built-in backends. - -* **Automatic transfer between local/cloud storages**: You can use URIs (e.g. `gs://`, `http(s)://` and `s3://`) instead of paths in a command line arguments, also in your input JSON file. Files associated with these URIs will be automatically transfered to a specified temporary directory on a target remote storage. - -* **Deepcopy for input JSON file**: Recursively copy all data files in (`.json`, `.tsv` and `.csv`) to a target remote storage. Use `--deepcopy` for this feature. - -* **Docker/Singularity integration**: You can run a WDL workflow in a specifed docker/singularity container. - -* **MySQL database integration**: Caper defaults to use Cromwell's built-in HyperSQL DB to store metadata of all workflows. However, we also provide shell scripts to run a MySQL database server in a docker/singularity container. Using Caper with those databases will allow you to use Cromwell's [call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous successful tasks. This will be useful to resume a failed workflow where it left off. - -* **One configuration file for all**: You may not want to repeat writing same command line parameters for every pipeline run. Define parameters in a configuration file at `~/.caper/default.conf`. - -* **One server for six backends**: Built-in backends allow you to submit pipelines to any local/remote backend specified with `-b` or `--backend`. - -* **Cluster engine support**: SLURM, SGE and PBS are currently supported locally. - -* **Easy workflow management**: Find all workflows submitted to a Cromwell server by workflow IDs (UUIDs) or `str_label` (special label for a workflow submitted by Caper `submit` and `run`). You can define multiple keywords with wildcards (`*` and `?`) to search for matching workflows. Abort, release hold, retrieve metadata JSON for them. - -* **Automatic subworkflow packing**: Caper automatically creates an archive (`imports.zip`) of all imports and send it to Cromwell server/run. 
- -* **Special label** (`str_label`): You have a string label, specified with `-s` or `--str-label`, for your workflow so that you can search for your workflow by this label instead of Cromwell's workflow UUID (e.g. `f12526cb-7ed8-4bfa-8e2e-a463e94a61d0`). - ## Installation Make sure that you have `python3`(> 3.4.1) installed on your system. Use `pip` to install Caper. @@ -48,7 +22,7 @@ $ echo "export PATH=\"\$PATH:$PWD/caper/bin\"" >> ~/.bashrc ## Usage -There are 7 subcommands available for Caper. Except for `run` other subcommands work with a running Cromwell server, which can be started with `server` subcommand. `server` does not require a positional argument. `WF_ID` (workflow ID) is a UUID generated from Cromwell to identify a workflow. `STR_LABEL` is Caper's special string label to be used to identify a workflow. +There are 7 subcommands available for Caper. Except for `run` other subcommands work with a running Caper server, which can be started with `server` subcommand. `server` does not require a positional argument. `WF_ID` (workflow ID) is a UUID generated from Cromwell to identify a workflow. `STR_LABEL` is Caper's special string label to be used to identify a workflow. **Subcommand**|**Positional args** | **Description** :--------|:-----|:----- @@ -64,17 +38,18 @@ troubleshoot | WF_ID, STR_LABEL or
METADATA_JSON_FILE |Analyze reason for err
 * `run`: To run a single workflow. A string label `-s` is optional and useful for other subcommands to indentify a workflow.
 
 	```bash
-	$ caper run [WDL] -i [INPUT_JSON] -s [STR_LABEL]
+	$ caper run [WDL] -i [INPUT_JSON]
 	```
 
-	> **WARNING**: If you try to run multiple workflows at the same time then you will see a DB connection error message since multiple Caper instances will try to lock the same DB file `~/.caper/default_file_db`. Use a server-based [MySQL database](DETAILS.md/#mysql-server) instead or disable connection to DB with `--no-file-db` but you will not be able to take advantage of [Cromwell's call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous workflows. We recomend to use `server` and `submit` for multiple concurrent workflows.
+	> **WARNING**: If you try to run multiple workflows at the same time, you will see a `db - Connection is not available` error message since multiple Caper instances will try to lock the same DB file `~/.caper/default_file_db`.
 
 	```bash
-	[error] Failed to instantiate Cromwell System. Shutting down Cromwell.
 	java.sql.SQLTransientConnectionException: db - Connection is not available, request timed out after 3000ms.
 	```
 
-* `server`: To start a server.
+	> **WORKAROUND**: Define a different DB file per run with `--file-db`. Alternatively, start a Caper server and submit multiple workflows to it so that only one Caper server holds the DB file, or use a server-based [MySQL database](DETAILS.md/#mysql-server) instead. You can also disable the DB connection with `--no-file-db` or `-n`, but then you will not be able to use [Cromwell's call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous workflows.
+
+* `server`: To start a server. You can define a server port with `--port`; use a different port for each server if you run multiple servers. If you don't use the default port (`8000`), define `--port` for all client subcommands like `submit`, `list` and `troubleshoot`. If you run a server on a different IP address or hostname, define it with `--ip` for all client subcommands like `submit`.
 
   ```bash
   $ caper server
@@ -83,7 +58,7 @@ troubleshoot | WF_ID, STR_LABEL or
METADATA_JSON_FILE |Analyze reason for err * `submit`: To submit a workflow to a server. Define a string label for submitted workflow with `-s`. It is optional but useful for other subcommands to indentify a workflow. ```bash - $ caper submit [WDL] -i [INPUT_JSON] -s [STR_LABEL] + $ caper submit [WDL] -i [INPUT_JSON] -s [STR_LABEL] ``` * `list`: To show a list of all workflows submitted to a cromwell server. Wildcard search with using `*` and `?` is allowed for such label for the following subcommands with `STR_LABEL`. @@ -103,9 +78,14 @@ troubleshoot | WF_ID, STR_LABEL or
METADATA_JSON_FILE |Analyze reason for err ## Configuration file +Run Caper without parameters to generate a default configuration file. +```bash +$ caper +``` + Caper automatically creates a default configuration file at `~/.caper/default.conf`. Such configruation file comes with all available parameters commented out. You can uncomment/define any parameter to activate it. -You can avoid repeatedly defining same parameters in your command line arguments by using a configuration file. For example, you can define `out-dir` and `tmp-dir` in your configuration file instead of defining them in command line arguments. +You can avoid repeatedly defining same parameters in your command line arguments. For example, you can define `out-dir` and `tmp-dir` in your configuration file instead of defining them in command line arguments. ``` $ caper run [WDL] --out-dir [LOCAL_OUT_DIR] --tmp-dir [LOCAL_TMP_DIR] ``` @@ -118,172 +98,169 @@ out-dir=[LOCAL_OUT_DIR] tmp-dir=[LOCAL_TMP_DIR] ``` -## Initialize it +## Minimum required parameters -Run Caper without parameters to generate a default configuration file. -```bash -$ caper +An auto-generated default configuration has a `Minimum required parameters` section on top. Other parameters in other sections are optional and most users will not be interested in them. If you don't see this section then remove existing default configuration file and regenerate it. + +Edit your configuration file (`~/.caper/default.conf` by default) and uncomment/define parameters for your preferred backend. ``` +[defaults] -## Database +############ Minimum required parameters +## Please read through carefully -Caper defaults to use Cromwell's built-in HyperSQL file database located at `~/.caper/default_file_db`. You can change default database file path prefix in a default configuration file (`~/.caper/default.conf`). Setting up a database is important for Caper to re-use outputs from previous failed/succeeded workflows. -``` -file-db=[YOUR_FILE_DB_PATH_PREFIX] -``` +## Define file DB to use Cromwell's call-caching +## Call-caching is important for restarting failed workflows +## File DB can only be accessed by one caper process (caper run or server) +## i.e. you cannot run multiple caper run with one file DB +## For such case, we recommend to use caper server and submit multiple workflows to it +## You can disable file DB with '--no-file-db' or '-n' +#file-db=~/.caper/default_file_db -You can also use your own MySQL database if you [configure MySQL for Caper](DETAILS.md/#mysql-server). +## Define to use 'caper server' and all client subcommands like 'caper submit' +## This is not required for 'caper run' +#port=8000 -## Singularity +## Define default backend (local, gcp, aws, slurm, sge, pbs) +#backend=local -Caper supports Singularity for its local built-in backend (`local`, `slurm`, `sge` and `pbs`). Tasks in a workflow will run inside a container and outputs will be pulled out to a host from it at the end of each task. Or you can add `--use-singularity` to use a [Singularity image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization). +## Define output directory if you want to run pipelines locally +#out-dir= -```bash -$ caper run [WDL] -i [INPUT_JSON] --singularity [SINGULARITY_IMAGE_URI] +## Define if you want to run pipelines on Google Cloud Platform +#gcp-prj=encode-dcc-1016 +#out-gcs-bucket=gs://encode-pipeline-test-runs/project1/caper_out + +## Define if you want to run pipelines on AWS +#aws-batch-arn=arn:.... 
+#aws-region=us-west-1
+#out-s3-bucket=s3://encode-pipeline-test-runs/project1/caper_out
+
+## Define if you want to run pipelines on SLURM
+## Define partition or account or both according to your cluster's requirements
+## For example, Stanford requires a partition and SCG requires an account.
+#slurm-partition=akundaje
+#slurm-account=akundaje
+
+## Define if you want to run pipelines on SGE
+#sge-pe=shm
+
+## Define if your SGE cluster requires a queue
+#sge-queue=q
+
+## Define if your PBS cluster requires a queue
+#pbs-queue=q
 ```
 
-Define a cache directory where local Singularity images will be built. You can also define an environment variable `SINGULARITY_CACHEDIR`.
+> **RECOMMENDATION**: Instead of using the default configuration file at `~/.caper/default.conf`, you can specify your own configuration file with `caper -c`. This is useful when you want to manage a configuration file per project (e.g. use a different file DB `--file-db` per project to prevent locking).
 ```
-singularity-cachedir=[SINGULARITY_CACHEDIR]
+$ caper -c [YOUR_CONF_FILE_FOR_PROJECT_1] ...
 ```
 
-Singularity image will be built first before running a workflow to prevent mutiple tasks from competing to write on the same local image file. If you don't define it, every task in a workflow will try to repeatedly build a local Singularity image on their temporary directory.
+## Running workflows on GCP/AWS backends
 
+Cloud backends (AWS and GCP) write outputs to their corresponding storage buckets (S3 and GCS). Caper internally uses the cloud CLIs `gsutil` and `aws`, so make sure that these CLIs are installed and configured correctly.
 
-## Docker
+> **WARNING**: On the GCP backend you can deploy a workflow from your local computer. However, due to AWS security reasons, you cannot do this on the AWS backend: you need to spin up an AWS instance on the AWS Console, configure `aws` on that instance and run Caper there.
 
-Caper supports Docker for its non-HPC backends (`local`, `aws` and `gcp`).
+1) Google Cloud Platform (GCP): Install [gsutil](https://cloud.google.com/storage/docs/gsutil_install). [Configure for gcloud and gsutil](docs/conf_gcp.md).
 
-> **WARNING**: AWS and GCP backends will not work without a Docker image URI defined in a WDL file or specified with `--docker`. You can skip adding `--use-docker` since Caper will try to find it in your WDL first.
+2) AWS: [Configure for AWS](docs/conf_aws.md) first.
 
-Tasks in a workflow will run inside a container and outputs will be pulled out to a host from it at the end of each task. Or you can add `--use-docker` to use a [Docker image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization).
+## Deepcopy (auto inter-storage transfer)
 
-```bash
-$ caper run [WDL] -i [INPUT_JSON] --docker [DOCKER_IMAGE_URI]
-```
+`--deepcopy` allows Caper to **RECURSIVELY** copy files defined in your input JSON into your target backend's temporary storage. For example, Cromwell cannot read directly from URLs in an [input JSON file](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/examples/caper/ENCSR356KRQ_subsampled.json), but Caper with `--deepcopy` makes copies of these files in your backend's temporary directory (e.g. `--tmp-dir` for `local`, `--tmp-gcs-bucket` for `gcp`) and passes them to Cromwell.
 
-## Conda
+## How to manage configuration file per project
 
-Activate your `CONDA_ENV` before running Caper (both for `run` and `server` modes).
-```bash
-$ conda activate [COND_ENV]
+It is useful to have a configuration file per project. For example, consider the following two projects.
+ +We want to run pipelines locally for project-1, run a server with `caper -c project_1.conf server` and submit a workflow with `caper -c project_1.conf submit [WDL] ...` or run a single workflow `caper -c project_1.conf run [WDL] ...`. +``` +[defaults] +file-db=~/.caper/file_db_project_1 +port=8000 +backend=local +out-dir=/scratch/user/caper_out_project_1 ``` +We want to run pipelines on Google Cloud Platform for project-2. Run a server with `caper -c project_2.conf server` and submit a workflow with `caper -c project_2.conf submit [WDL] ...` or run a single workflow `caper -c project_2.conf run [WDL] ...`. +``` +[defaults] +file-db=~/.caper/file_db_project_2 +port=8001 +backend=gcp +gcp-prj=YOUR_GCP_PRJ_NAME +out-gcs-bucket=gs://caper_out_project_2 +``` + +Then you will see no conflict in file DBs and network ports (`8000` vs. `8001`) between two projects. + + ## How to run it According to your chosen backend, define the following parameters in your default configuration file (`~/.caper/default.conf`). * Local ``` - # if you want to run your workflow in a Singularity container - singularity-cachedir=[SINGULARITY_CACHEDIR] - - # directory to store all outputs + backend=local out-dir=[LOCAL_OUT_DIR] - # temporary directory for Caper - # lots of temporary files will be created and stored here - # e.g. backend.conf, workflow_opts.json, input.json, labels.json - # don't use /tmp - tmp-dir=[LOCAL_TMP_DIR] + # server mode + # ip is an IP address or hostname of a Cromwell server + # it's localhost by default but if you are submitting to + # a remote Cromwell server (e.g. from login node to a compute node) + # then take IP address of the server and write it here + ip=localhost + + # port is 8000 by default. but if it's already taken + # then try other ports like 8001 + port=8000 ``` * Google Cloud Platform (GCP): Install [gsutil](https://cloud.google.com/storage/docs/gsutil_install). [Configure for gcloud and gsutil](docs/conf_gcp.md). - ``` - # specify default backend as gcp backend=gcp - - # your project name on Google Cloud platform gcp-prj=YOUR_PRJ_NAME - - # directory to store all outputs out-gcs-bucket=gs://YOUR_OUTPUT_ROOT_BUCKET/ANY/WHERE - - # temporary bucket directory for Caper - tmp-gcs-bucket=gs://YOUR_TEMP_BUCKET/SOME/WHERE ``` -* AWS: Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-linux.html). [Configure for AWS](docs/conf_aws.md). - +* AWS: [Configure for AWS](docs/conf_aws.md) first. ``` - # specify default backend as aws backend=aws - - # ARN for your AWS Batch aws-batch-arn=ARN_FOR_YOUR_AWS_BATCH - - # directory to store all outputs + aws-region=YOUR_AWS_REGION out-s3-bucket=s3://YOUR_OUTPUT_ROOT_BUCKET/ANY/WHERE - - # temporary bucket directory for Caper - tmp-s3-bucket=s3://YOUR_TEMP_BUCKET/SOME/WHERE ``` * SLURM ``` - # specify default backend as slurm backend=slurm - - # if you want to run your workflow in a Singularity container - singularity-cachedir=[SINGULARITY_CACHEDIR] - - # directory to store all outputs out-dir=[LOCAL_OUT_DIR] - # temporary directory for Caper - # don't use /tmp - tmp-dir=[LOCAL_TMP_DIR] - # SLURM partition if required (e.g. on Stanford Sherlock) slurm-partition=[YOUR_PARTITION] # SLURM account if required (e.g. 
on Stanford SCG4) slurm-account=[YOUR_ACCOUMT] - # You may not need to specify the above two - # since most SLURM clusters have default rules for partition/account - - # server mode - # ip is an IP address or hostname of a Cromwell server - # it's localhost by default but if you are submitting to - # a remote Cromwell server (e.g. from login node to a compute node) - # then take IP address of the server and write it here ip=localhost - - # port is 8000 by default. but if it's already taken - # then try other ports like 8001 port=8000 ``` * SGE ``` - # specify default backend as sge backend=sge - - # if you want to run your workflow in a Singularity container - singularity-cachedir=[SINGULARITY_CACHEDIR] - - # directory to store all outputs out-dir=[LOCAL_OUT_DIR] - # temporary directory for Caper - # don't use /tmp - tmp-dir=[LOCAL_TMP_DIR] - - # SGE PE + # SGE PE (if you don't have it, ask your admin to create one) sge-pe=[YOUR_PARALLEL_ENVIRONMENT] - # server mode - # ip is an IP address or hostname of a Cromwell server - # it's localhost by default but if you are submitting to - # a remote Cromwell server (e.g. from login node to a compute node) - # then take IP address of the server and write it here - ip=localhost + # SGE queue if required + sge-queue=[YOUR_SGE_QUEUE] - # port is 8000 by default. but if it's already taken - # then try other ports like 8001 + ip=localhost port=8000 ``` From 1d3ba182c15c48553cfdd61123c2bb56349004dd Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 25 Jun 2019 13:22:55 -0700 Subject: [PATCH 4/4] ver up --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d4fc493e..21640903 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='caper', - version='v0.3.7', + version='v0.3.8', python_requires='>3.4.1', scripts=['bin/caper', 'mysql/run_mysql_server_docker.sh', 'mysql/run_mysql_server_singularity.sh'],