diff --git a/DETAILS.md b/DETAILS.md index 24b80c7d..85a21be9 100644 --- a/DETAILS.md +++ b/DETAILS.md @@ -18,10 +18,14 @@ We highly recommend to use a default configuration file described in the section :-----|:----- --str-label, -s|Caper's special label for a workflow. This will be used to identify a workflow submitted by Caper --docker|Docker image URI for a WDL - --singularity|Singaularity image URI for a WDL + --singularity|Singularity image URI for a WDL --use-docker|Use docker image for all tasks in a workflow by adding docker URI into docker runtime-attribute --use-singularity|Use singularity image for all tasks in a workflow --no-build-singularity|Local singularity image will not be built before running/submitting a workflow + --singularity-cachedir|Cache directory where local Singularity images will be built + --file-db|DB file for Cromwell's built-in HyperSQL database + --no-file-db|Do not use file-db. Call-caching (re-using outputs) will be disabled + * Choose a default backend. Use `--deepcopy` to recursively auto-copy data files in your input JSON file. All data files will be automatically transferred to a target local/remote storage corresponding to a chosen backend. Make sure that you correctly configure temporary directories for source/target storages (`--tmp-dir`, `--tmp-gcs-bucket` and `--tmp-s3-bucket`). diff --git a/README.md b/README.md index bf990515..3d65dc04 100644 --- a/README.md +++ b/README.md @@ -12,13 +12,13 @@ Caper is based on Unix and cloud platform CLIs (`curl`, `gsutil` and `aws`) and * **Built-in backends**: You don't need your own backend configuration file. Caper provides built-in backends. -* **Automatic transfer between local/cloud storages**: You can use URIs (e.g. `gs://`, `http://` and `s3://`) instead of paths in a command line arguments, also in your input JSON file. Files associated with these URIs will be automatically transfered to a specified temporary directory on a target remote storage. +* **Automatic transfer between local/cloud storages**: You can use URIs (e.g. `gs://`, `http(s)://` and `s3://`) instead of paths in command line arguments and in your input JSON file. Files associated with these URIs will be automatically transferred to a specified temporary directory on a target remote storage. * **Deepcopy for input JSON file**: Recursively copy all data files in (`.json`, `.tsv` and `.csv`) to a target remote storage. Use `--deepcopy` for this feature. * **Docker/Singularity integration**: You can run a WDL workflow in a specifed docker/singularity container. -* **MySQL database integration**: We provide shell scripts to run a MySQL database server in a docker/singularity container. Using Caper with MySQL database will allow you to use Cromwell's [call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous successful tasks. This will be useful to resume a failed workflow where it left off. +* **MySQL database integration**: Caper defaults to using Cromwell's built-in HyperSQL DB to store metadata of all workflows. However, we also provide shell scripts to run a MySQL database server in a docker/singularity container. Using Caper with those databases will allow you to use Cromwell's [call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous successful tasks. This will be useful to resume a failed workflow where it left off. * **One configuration file for all**: You may not want to repeat writing same command line parameters for every pipeline run. 
Define parameters in a configuration file at `~/.caper/default.conf`. @@ -53,7 +53,7 @@ There are 7 subcommands available for Caper. Except for `run` other subcommands **Subcommand**|**Positional args** | **Description** :--------|:-----|:----- server | |Run a Cromwell server with built-in backends -run | WDL |Run a single workflow +run | WDL |Run a single workflow (not recommended for multiple workflows) submit | WDL |Submit a workflow to a Cromwell server abort | WF_ID or STR_LABEL |Abort submitted workflows on a Cromwell server unhold | WF_ID or STR_LABEL |Release hold of workflows on a Cromwell server @@ -61,33 +61,38 @@ list | WF_ID or STR_LABEL |List submitted workflows on a Cromwell server metadata | WF_ID or STR_LABEL |Retrieve metadata JSONs for workflows troubleshoot | WF_ID, STR_LABEL or
METADATA_JSON_FILE |Analyze reason for errors -Examples: - -* `run`: To run a single workflow. Add `--hold` to put an hold to submitted workflows. +* `run`: To run a single workflow. A string label `-s` is optional and useful for other subcommands to identify a workflow. ```bash - $ caper run [WDL] -i [INPUT_JSON] + $ caper run [WDL] -i [INPUT_JSON] -s [STR_LABEL] ``` -* `server`: To start a server + > **WARNING**: If you try to run multiple workflows at the same time you will see a DB connection error message, since multiple Caper instances will try to lock the same DB file `~/.caper/default_file_db`. Use a server-based [MySQL database](DETAILS.md/#mysql-server) instead, or disable the DB connection with `--no-file-db`, but then you will not be able to take advantage of [Cromwell's call-caching](https://cromwell.readthedocs.io/en/develop/Configuring/#call-caching) to re-use outputs from previous workflows. We recommend using `server` and `submit` for multiple concurrent workflows. + + ```bash + [2019-06-06 03:40:00,39] [error] Failed to instantiate Cromwell System. Shutting down Cromwell. + java.sql.SQLTransientConnectionException: db - Connection is not available, request timed out after 3000ms. + ``` + +* `server`: To start a server. ```bash $ caper server ``` -* `submit`: To submit a workflow to a server. `-s` is optional but useful for other subcommands to find submitted workflow with matching string label. +* `submit`: To submit a workflow to a server. Define a string label for a submitted workflow with `-s`. It is optional but useful for other subcommands to identify a workflow. ```bash $ caper submit [WDL] -i [INPUT_JSON] -s [STR_LABEL] ``` -* `list`: To show list of all workflows submitted to a cromwell server. Wildcard search with using `*` and `?` is allowed for such label for the following subcommands with `STR_LABEL`. +* `list`: To show a list of all workflows submitted to a Cromwell server. Wildcard search using `*` and `?` is allowed for string labels in the following subcommands with `STR_LABEL`. ```bash $ caper list [WF_ID or STR_LABEL] ``` -* `troubleshoot`: To analyze reasons for workflow failures. You can specify a metadata JSON file. or workflow IDs and labels. Wildcard search with using `*` and `?` is allowed for string lables. +* `troubleshoot`: To analyze reasons for workflow failures. You can specify a failed workflow's metadata JSON file or workflow IDs and labels. Wildcard search using `*` and `?` is allowed for string labels. ```bash $ caper troubleshoot [WF_ID, STR_LABEL or METADATA_JSON_FILE] @@ -100,7 +105,7 @@ Examples: Caper automatically creates a default configuration file at `~/.caper/default.conf`. Such configruation file comes with all available parameters commented out. You can uncomment/define any parameter to activate it. -You can avoid repeatedly defining same parameters in your command line arguments by using a configuration file. For example, you can define `out_dir` and `tmp_dir` in your configuration file instead of defining them in command line arguments. +You can avoid repeatedly defining the same parameters in your command line arguments by using a configuration file. For example, you can define `out-dir` and `tmp-dir` in your configuration file instead of defining them in command line arguments. 
``` $ caper run [WDL] --out-dir [LOCAL_OUT_DIR] --tmp-dir [LOCAL_TMP_DIR] ``` @@ -113,18 +118,63 @@ out-dir=[LOCAL_OUT_DIR] tmp-dir=[LOCAL_TMP_DIR] ``` -## Before running it +## Initialize it Run Caper without parameters to generate a default configuration file. - ```bash $ caper ``` +## Database + +Caper defaults to using Cromwell's built-in HyperSQL file database located at `~/.caper/default_file_db`. You can change the default database file path prefix in the default configuration file (`~/.caper/default.conf`). Setting up a database is important for Caper to re-use outputs from previous failed/succeeded workflows. +``` +file-db=[YOUR_FILE_DB_PATH_PREFIX] +``` + +You can also use your own MySQL database if you [configure MySQL for Caper](DETAILS.md/#mysql-server). + +## Singularity + +Caper supports Singularity for its local built-in backends (`local`, `slurm`, `sge` and `pbs`). Tasks in a workflow will run inside a container and outputs will be pulled out to the host at the end of each task. Or you can add `--use-singularity` to use a [Singularity image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization). + +```bash +$ caper run [WDL] -i [INPUT_JSON] --singularity [SINGULARITY_IMAGE_URI] +``` + +## Docker + +Caper supports Docker for its non-HPC backends (`local`, `aws` and `gcp`). + +> **WARNING**: AWS and GCP backends will not work without a Docker image URI defined in a WDL file or specified with `--docker`. You can skip adding `--use-docker` since Caper will try to find it in your WDL first. + +Tasks in a workflow will run inside a container and outputs will be pulled out to the host at the end of each task. Or you can add `--use-docker` to use a [Docker image URI defined in your WDL as a comment](DETAILS.md/#wdl-customization). + +```bash +$ caper run [WDL] -i [INPUT_JSON] --docker [DOCKER_IMAGE_URI] +``` + +For Singularity, define a cache directory where local Singularity images will be built. You can also define the environment variable `SINGULARITY_CACHEDIR` instead. +``` +singularity-cachedir=[SINGULARITY_CACHEDIR] +``` + +The Singularity image will be built before running a workflow to prevent multiple tasks from competing to write to the same local image file. If you don't define it, every task in a workflow will repeatedly try to build a local Singularity image in its own temporary directory. + +## Conda + +Activate your `CONDA_ENV` before running Caper (both for `run` and `server` modes). +```bash +$ conda activate [CONDA_ENV] +``` + ## How to run it on a local computer -Define two important parameters in your default configuration file (`~/.caper/default.conf`). +Define the following important parameters in your default configuration file (`~/.caper/default.conf`). ``` +# if you want to run your workflow in a Singularity container +singularity-cachedir=[SINGULARITY_CACHEDIR] + # directory to store all outputs out-dir=[LOCAL_OUT_DIR] @@ -135,11 +185,17 @@ out-dir=[LOCAL_OUT_DIR] tmp-dir=[LOCAL_TMP_DIR] ``` -Run Caper. `--deepcopy` is optional for remote (http://, gs://, s3://, ...) `INPUT_JSON` file. +Run Caper. `--deepcopy` is optional to recursively make local copies of remote files (`http(s)://`, `gs://` and `s3://`) in an `INPUT_JSON` file. ```bash $ caper run [WDL] -i [INPUT_JSON] --deepcopy ``` +Or run a server and submit a workflow to it. +```bash +$ caper server +$ caper submit [WDL] -i [INPUT_JSON] --deepcopy +``` + ## How to run it on Google Cloud Platform (GCP) Install [gsutil](https://cloud.google.com/storage/docs/gsutil_install). [Configure for gcloud and gsutil](docs/conf_gcp.md). 
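Optionally, before running a workflow on GCP you can sanity-check your credentials and bucket access from the same shell. This check is not part of Caper itself, and the bucket name below is a placeholder for your own output bucket.

```bash
# show the active gcloud account (it should be the one you just configured)
$ gcloud auth list

# confirm that you can reach your output bucket
$ gsutil ls gs://YOUR_OUTPUT_ROOT_BUCKET/
```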
@@ -156,11 +212,17 @@ out-gcs-bucket=gs://YOUR_OUTPUT_ROOT_BUCKET/ANY/WHERE tmp-gcs-bucket=gs://YOUR_TEMP_BUCKET/SOME/WHERE ``` -Run Caper. `--deepcopy` is optional for remote (local, http://, s3://, ...) `INPUT_JSON` file. +Run Caper. `--deepcopy` is optional to recursively make GCS copies of remote files (`http(s)://`, `s3://` and local path) in an `INPUT_JSON` file. ```bash $ caper run [WDL] -i [INPUT_JSON] --backend gcp --deepcopy ``` +Or run a server and submit a workflow to it. +```bash +$ caper server +$ caper submit [WDL] -i [INPUT_JSON] --backend gcp --deepcopy +``` + ## How to run it on AWS Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-linux.html). [Configure for AWS](docs/conf_aws.md). @@ -177,36 +239,36 @@ out-s3-bucket=s3://YOUR_OUTPUT_ROOT_BUCKET/ANY/WHERE tmp-s3-bucket=s3://YOUR_TEMP_BUCKET/SOME/WHERE ``` -Run Caper. `--deepcopy` is optional for remote (http://, gs://, local, ...) `INPUT_JSON` file. +Run Caper. `--deepcopy` is optional to recursively make S3 copies of remote files (`http(s)://`, `gs://` and local path) in an `INPUT_JSON` file. ```bash $ caper run [WDL] -i [INPUT_JSON] --backend aws --deepcopy ``` +Or run a server and submit a workflow to it. +```bash +$ caper server +$ caper submit [WDL] -i [INPUT_JSON] --backend aws --deepcopy +``` ## How to run it on SLURM cluster Define the following important parameters in your default configuration file (`~/.caper/default.conf`). ``` -# for workflows with singularity support. -# local singularity image will be built here -# define it to prevent repeatedly building -# singularity image for every pipeline task +# if you want to run your workflow in a Singularity container singularity-cachedir=[SINGULARITY_CACHEDIR] # directory to store all outputs out-dir=[LOCAL_OUT_DIR] # temporary directory for Caper -# lots of temporary files will be created and stored here -# e.g. backend.conf, workflow_opts.json, input.json, labels.json # don't use /tmp tmp-dir=[LOCAL_TMP_DIR] # SLURM partition if required (e.g. on Stanford Sherlock) -slurm-partition=YOUR_PARTITION +slurm-partition=[YOUR_PARTITION] # SLURM account if required (e.g. on Stanford SCG4) -slurm-account=YOUR_ACCOUMT +slurm-account=[YOUR_ACCOUNT] # You may not need to specify the above two # since most SLURM clusters have default rules for partition/account @@ -223,53 +285,53 @@ ip=localhost port=8000 ``` -Run Caper. `--deepcopy` is optional for remote (http://, gs://, s3://, ...) `INPUT_JSON` file. Make sure to keep your SSH session alive. We don't recommend to run it on a login node. Cromwell is a Java application which is not lightweight. A cluster can kill your workflow. +Run Caper. Make sure to keep your SSH session alive. + +> **WARNING**: We don't recommend running it on a login node. Caper can be killed while building a local Singularity image or deepcopying remote files, and Cromwell is a Java application which is not lightweight. Reserve an interactive node with `srun` with at least one CPU, 1GB RAM and a long enough walltime first. + ```bash -$ caper run [WDL] -i [INPUT_JSON] --backend slurm --deepcopy +$ caper run [WDL] -i [INPUT_JSON] --backend slurm ``` -Or run a Cromwell server with Caper. Make sure to keep server's SSH session alive. We recommend to run a server on a non-login node with at least one CPU, 2GB RAM and long enough walltime. Take IP address of your compute node and update your default configuration file with it. If there is any conflicting port, then change port in your configuration file. 
+Or run a Cromwell server with Caper. Make sure to keep the server's SSH session alive. + +> **WARNING**: We recommend running a server on a non-login node with at least one CPU, 2GB RAM and a long enough walltime. Take the IP address or hostname of your compute node and update your default configuration file with it. If the port conflicts with another service, change the port in your configuration file. ```bash -$ hostname # get hostname of a compute/login node -$ caper server +$ hostname # get IP address or hostname of a compute/login node +$ caper server --backend slurm ``` -Then submit a workflow to the server. +Then submit a workflow to the server. Point to the server with `--ip`. The TCP port (`-p`/`--port`) is optional unless you have changed the default port `8000`. ```bash -$ caper submit [WDL] -i [INPUT_JSON] --deepcopy -p [PORT] +$ caper submit [WDL] -i [INPUT_JSON] --ip [SERVER_HOSTNAME] --port [PORT] ``` -On HPC cluster with Singularity installed, run Caper with a Singularity container if that is [defined inside `WDL`](DETAILS.md/#wdl-customization). +On an HPC cluster with Singularity installed, run Caper with a Singularity container if one is [defined inside the `WDL`](DETAILS.md/#wdl-customization), for example in the ENCODE [ATAC-seq](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/atac.wdl#L5) and [ChIP-seq](https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl#L5) pipelines. ```bash -$ caper run [WDL] -i [INPUT_JSON] --backend slurm --deepcopy --use-singularity +$ caper run [WDL] -i [INPUT_JSON] --backend slurm --use-singularity ``` Or specify your own Singularity container. ```bash -$ caper run [WDL] -i [INPUT_JSON] --backend slurm --deepcopy --singularity [YOUR_SINGULARITY_IMAGE] +$ caper run [WDL] -i [INPUT_JSON] --backend slurm --singularity [SINGULARITY_IMAGE_URI] ``` ## How to run it on SGE cluster -Define four important parameters in your default configuration file (`~/.caper/default.conf`). +Define the following important parameters in your default configuration file (`~/.caper/default.conf`). ``` -# for workflows with singularity support. -# local singularity image will be built here -# define it to prevent repeatedly building -# singularity image for every pipeline task +# if you want to run your workflow in a Singularity container singularity-cachedir=[SINGULARITY_CACHEDIR] # directory to store all outputs out-dir=[LOCAL_OUT_DIR] # temporary directory for Caper -# lots of temporary files will be created and stored here -# e.g. backend.conf, workflow_opts.json, input.json, labels.json # don't use /tmp tmp-dir=[LOCAL_TMP_DIR] # SGE PE -sge-pe=YOUR_PARALLEL_ENVIRONMENT +sge-pe=[YOUR_PARALLEL_ENVIRONMENT] # server mode # ip is an IP address or hostname of a Cromwell server @@ -282,68 +344,35 @@ ip=localhost # then try other ports like 8001 port=8000 ``` +Run Caper. Make sure to keep your SSH session alive. -Run Caper. `--deepcopy` is optional for remote (http://, gs://, s3://, ...) `INPUT_JSON` file. Make sure to keep your SSH session alive. We don't recommend to run it on a login node. Cromwell is a Java application which is not lightweight. A cluster can kill your workflow. -```bash -$ caper run [WDL] -i [INPUT_JSON] --backend sge --deepcopy -``` +> **WARNING**: We don't recommend running it on a login node. Caper can be killed while building a local Singularity image or deepcopying remote files, and Cromwell is a Java application which is not lightweight. Reserve an interactive node (e.g. with `qlogin` on SGE) with at least one CPU, 1GB RAM and a long enough walltime first. -Or run a Cromwell server with Caper. 
Make sure to keep server's SSH session alive. We recommend to run a server on a non-login node with at least one CPU, 2GB RAM and long enough walltime. Take IP address of your compute node and update your default configuration file with it. If there is any conflicting port, then change port in your configuration file. ```bash -$ hostname # get hostname of a compute/login node -$ caper server +``` -Then submit pipelines to the server. +Or run a Cromwell server with Caper. Make sure to keep the server's SSH session alive. + +> **WARNING**: We recommend running a server on a non-login node with at least one CPU, 2GB RAM and a long enough walltime. Take the IP address or hostname of your compute node and update your default configuration file with it. If the port conflicts with another service, change the port in your configuration file. ```bash -$ caper submit [WDL] -i [INPUT_JSON] --deepcopy -p [PORT] +$ hostname # get IP address or hostname of a compute/login node +$ caper server --backend sge ``` -## How to resume a failed workflow - -You need to set up a [MySQL database server](DETAILS.md/#mysql-server) to use Cromwell's call-caching feature, which allows a failed workflow to start from where it left off. Use the same command line that you used to start a workflow then Caper will automatically skip tasks that are already done successfully. - -Make sure you have Docker or Singularity installed on your system. Singularity does not require super-user privilege to be installed. - -Configure for MySQL DB in a default configuration file `~/.caper/default.conf`. -``` -# MySQL DB port -# try other port if already taken -mysql-db-port=3307 +Then submit a workflow to the server. Point to the server with `--ip`. The TCP port (`-p`/`--port`) is optional unless you have changed the default port `8000`. +```bash +$ caper submit [WDL] -i [INPUT_JSON] --ip [SERVER_HOSTNAME] --port [PORT] ``` -`DB_DIR` is a directory to be used as a DB storage. Create an empty directory if it's for the first time. `DB_PORT` is a MySQL DB port. If there is any conflict use other ports. - -1) Docker - - ```bash - $ run_mysql_server_docker.sh [DB_DIR] [DB_PORT] - ``` - -2) Singularity - - ```bash - $ run_mysql_server_singularity.sh [DB_DIR] [DB_PORT] - ``` - -## Singularity container - -In order to run workflow's tasks in a Singularity container. Define the following parameter in your default configuration file (`~/.caper/default.conf`). Caper can work without defining this parameter but Singularity will try to pull the image from a remote repo and build it locally everytime for each task. - -> ***WARNING***: To prevent overhead of repeatedly building Singularity image for each pipeline run, define the following parameter. - -``` -# for workflows with singularity support -singularity-cachedir=[SINGULARITY_CACHEDIR] +On an HPC cluster with Singularity installed, run Caper with a Singularity container if one is [defined inside the `WDL`](DETAILS.md/#wdl-customization), for example in the ENCODE [ATAC-seq](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/atac.wdl#L5) and [ChIP-seq](https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl#L5) pipelines. +```bash +$ caper run [WDL] -i [INPUT_JSON] --backend sge --use-singularity ``` -Or Caper can also read it from an environment variable `SINGULARITY_CACHEDIR`. - -## Conda - -Activate your `CONDA_ENV` before running Caper (both for `run` and `server` modes). +Or specify your own Singularity container. 
```bash -$ conda activate [COND_ENV] +$ caper run [WDL] -i [INPUT_JSON] --backend sge --singularity [SINGULARITY_IMAGE_URI] ``` # DETAILS diff --git a/caper/caper.py b/caper/caper.py index 700184b7..44fb4282 100644 --- a/caper/caper.py +++ b/caper/caper.py @@ -29,7 +29,7 @@ from .caper_uri import URI_S3, URI_GCS, URI_LOCAL, \ init_caper_uri, CaperURI from .caper_backend import BACKEND_GCP, BACKEND_AWS, BACKEND_LOCAL, \ - CaperBackendCommon, CaperBackendMySQL, CaperBackendGCP, \ + CaperBackendCommon, CaperBackendDatabase, CaperBackendGCP, \ CaperBackendAWS, CaperBackendLocal, CaperBackendSLURM, \ CaperBackendSGE, CaperBackendPBS @@ -119,10 +119,7 @@ def __init__(self, args): self._sge_extra_param = args.get('sge_extra_param') self._pbs_queue = args.get('pbs_queue') self._pbs_extra_param = args.get('pbs_extra_param') - self._mysql_db_ip = args.get('mysql_db_ip') - self._mysql_db_port = args.get('mysql_db_port') - self._mysql_db_user = args.get('mysql_db_user') - self._mysql_db_password = args.get('mysql_db_password') + self._backend_file = args.get('backend_file') self._wdl = args.get('wdl') self._inputs = args.get('inputs') @@ -134,6 +131,16 @@ def __init__(self, args): self._metadata_output = args.get('metadata_output') self._singularity_cachedir = args.get('singularity_cachedir') + # file DB + self._file_db = args.get('file_db') + self._no_file_db = args.get('no_file_db') + + # MySQL DB + self._mysql_db_ip = args.get('mysql_db_ip') + self._mysql_db_port = args.get('mysql_db_port') + self._mysql_db_user = args.get('mysql_db_user') + self._mysql_db_password = args.get('mysql_db_password') + # backend and default backend self._backend = args.get('backend') if self._backend is None: @@ -526,29 +533,29 @@ def __find_singularity_bindpath(input_json_file): with open(input_json_file, 'r') as fp: input_json = json.loads(fp.read()) - # find dirname of all files - def recurse_dict(d, d_parent=None, d_parent_key=None, - lst=None, lst_idx=None): - result = set() - if isinstance(d, dict): - for k, v in d.items(): - result |= recurse_dict(v, d_parent=d, - d_parent_key=k) - elif isinstance(d, list): - for i, v in enumerate(d): - result |= recurse_dict(v, lst=d, - lst_idx=i) - elif type(d) == str: - assert(d_parent is not None or lst is not None) - c = CaperURI(d) - # local absolute path only - if c.uri_type == URI_LOCAL and c.is_valid_uri(): - dirname, basename = os.path.split(c.get_uri()) - result.add(dirname) - - return result - - all_dirnames = recurse_dict(input_json) + # find dirname of all files + def recurse_dict(d, d_parent=None, d_parent_key=None, + lst=None, lst_idx=None): + result = set() + if isinstance(d, dict): + for k, v in d.items(): + result |= recurse_dict(v, d_parent=d, + d_parent_key=k) + elif isinstance(d, list): + for i, v in enumerate(d): + result |= recurse_dict(v, lst=d, + lst_idx=i) + elif type(d) == str: + assert(d_parent is not None or lst is not None) + c = CaperURI(d) + # local absolute path only + if c.uri_type == URI_LOCAL and c.is_valid_uri(): + dirname, basename = os.path.split(c.get_uri()) + result.add(dirname) + + return result + + all_dirnames = recurse_dict(input_json) # add all (but not too high level<4) parent directories # to all_dirnames. start from self # e.g. 
/a/b/c/d/e/f/g/h with COMMON_ROOT_SEARCH_LEVEL = 5 @@ -903,16 +910,19 @@ def __get_backend_conf_str(self): extra_param=self._pbs_extra_param, concurrent_job_limit=self._max_concurrent_tasks)) - # MySQL is optional - if self._mysql_db_user is not None \ - and self._mysql_db_password is not None: - merge_dict( - backend_dict, - CaperBackendMySQL( - ip=self._mysql_db_ip, - port=self._mysql_db_port, - user=self._mysql_db_user, - password=self._mysql_db_password)) + # Database + if self._no_file_db is not None and self._no_file_db: + file_db = None + else: + file_db = self._file_db + merge_dict( + backend_dict, + CaperBackendDatabase( + file_db=file_db, + mysql_ip=self._mysql_db_ip, + mysql_port=self._mysql_db_port, + mysql_user=self._mysql_db_user, + mysql_password=self._mysql_db_password)) # set header for conf ("include ...") assert(Caper.BACKEND_CONF_HEADER.endswith('\n')) diff --git a/caper/caper_args.py b/caper/caper_args.py index bdae1182..b3a1a747 100644 --- a/caper/caper_args.py +++ b/caper/caper_args.py @@ -13,6 +13,7 @@ DEFAULT_CAPER_CONF = '~/.caper/default.conf' +DEFAULT_FILE_DB = '~/.caper/default_file_db' DEFAULT_CROMWELL_JAR = 'https://github.com/broadinstitute/cromwell/releases/download/40/cromwell-40.jar' DEFAULT_MYSQL_DB_IP = 'localhost' DEFAULT_MYSQL_DB_PORT = 3306 @@ -81,7 +82,19 @@ #http-user= #http-password= +############# Cromwell's built-in HyperSQL database settings +## DB file prefix path +#file-db=~/.caper/default_file_db + +## disable file-db +## Detach DB from Cromwell +## you can run multiple workflows with 'caper run' command +## but Caper will not be able to re-use outputs from previous workflows +#no-file-db=True + ############# MySQL database settings +## both caper run/server modes will attach to MySQL db +## uncomment/define all of the following to use MySQL database #mysql-db-ip= #mysql-db-port= #mysql-db-user=cromwell @@ -270,6 +283,14 @@ def parse_caper_arguments(): 'environment variable SINGULARITY_CACHEDIR. ' 'Define it to prevent repeatedly building a singularity image ' 'for every pipeline task') + parent_submit.add_argument( + '--file-db', default=DEFAULT_FILE_DB, + help='Default DB file for Cromwell\'s built-in HyperSQL database.') + parent_submit.add_argument( + '--no-file-db', action='store_true', + help='Disable file DB for Cromwell\'s built-in HyperSQL database. 
' + 'An in-memory DB will still be available for server mode.') + # run parent_run = argparse.ArgumentParser(add_help=False) parent_run.add_argument( @@ -448,6 +469,10 @@ def parse_caper_arguments(): and isinstance(no_build_singularity, str): args_d['no_build_singularity'] = bool(strtobool(no_build_singularity)) + no_file_db = args_d.get('no_file_db') + if no_file_db is not None and isinstance(no_file_db, str): + args_d['no_file_db'] = bool(strtobool(no_file_db)) + # int string to int max_concurrent_tasks = args_d.get('max_concurrent_tasks') if max_concurrent_tasks is not None \ @@ -470,9 +495,13 @@ def parse_caper_arguments(): if args_d.get('out_s3_bucket'): args_d['tmp_s3_bucket'] = os.path.join(args_d['out_s3_bucket'], 'caper_tmp') - if args_d.get('tmp_gcs_bucket') is None: if args_d.get('out_gcs_bucket'): args_d['tmp_gcs_bucket'] = os.path.join(args_d['out_gcs_bucket'], 'caper_tmp') + file_db = args_d.get('file_db') + if file_db is not None: + file_db = os.path.abspath(os.path.expanduser(file_db)) + args_d['file_db'] = file_db + return args_d diff --git a/caper/caper_backend.py b/caper/caper_backend.py index 421262dc..02ca6dd3 100644 --- a/caper/caper_backend.py +++ b/caper/caper_backend.py @@ -54,8 +54,8 @@ def __init__(self, port=None, disable_call_caching=None, max_concurrent_workflows -class CaperBackendMySQL(dict): - """Common stanzas for MySQL +class CaperBackendDatabase(dict): + """Common stanzas for database """ TEMPLATE = { "database": { @@ -72,15 +72,23 @@ class CaperBackendMySQL(dict): } } - def __init__(self, ip, port, user, password): - super(CaperBackendMySQL, self).__init__( - CaperBackendMySQL.TEMPLATE) - db = self['database']['db'] - db['user'] = user - db['password'] = password - db['url'] = db['url'].replace('localhost:3306', '{ip}:{port}'.format( - ip=ip, port=port)) - + def __init__(self, file_db=None, mysql_ip=None, mysql_port=None, + mysql_user=None, mysql_password=None): + super(CaperBackendDatabase, self).__init__( + CaperBackendDatabase.TEMPLATE) + if mysql_user is not None and mysql_password is not None: + db = self['database']['db'] + db['user'] = mysql_user + db['password'] = mysql_password + db['url'] = db['url'].replace('localhost:3306', '{ip}:{port}'.format( + ip=mysql_ip, port=mysql_port)) + else: + self['database'] = {} + if file_db is not None: + self['database']['db'] = { + 'url': 'jdbc:hsqldb:file:{};shutdown=false;' + 'hsqldb.tx=mvcc'.format(file_db) + } class CaperBackendGCP(dict): """Google Cloud backend diff --git a/setup.py b/setup.py index 2598b457..c6ba657b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='caper', - version='v0.2.2', + version='v0.2.3', python_requires='>3.4.1', scripts=['bin/caper', 'mysql/run_mysql_server_docker.sh', 'mysql/run_mysql_server_singularity.sh'],
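As a usage sketch of the `--file-db`/`--no-file-db` options introduced in this change (the WDL, input JSON and DB path below are placeholders, and the exact call-caching behavior ultimately depends on Cromwell's configuration):

```bash
# Keep workflow metadata in a dedicated file DB so a failed run can be resumed:
# re-running the same command re-uses outputs of previously succeeded tasks via call-caching.
$ caper run [WDL] -i [INPUT_JSON] --file-db ~/.caper/my_pipeline_db

# Detach the DB entirely: multiple 'caper run' instances no longer compete for
# ~/.caper/default_file_db, but call-caching (re-using outputs) is disabled.
$ caper run [WDL] -i [INPUT_JSON] --no-file-db
```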