Falco 2.0 Release
- Added two new analysis modes: Alignment-only and Transcript Assembly
- Added Python 3 support to all analysis modes (including the Quantification analysis mode) and made PySpark use Python 3 by default
- Added support for using a custom AMI when launching an EMR cluster (requires EMR release >= 5.7)
andr-kun committed Sep 17, 2019
1 parent 5c1929f commit f8aea28
Showing 23 changed files with 2,088 additions and 83 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,5 +1,5 @@
Falco - Cloud based single-cell RNA-Seq pipeline
Copyright (c) 2016, Victor Chang Cardiac Research Institute
Copyright (c) 2016-2019, Victor Chang Cardiac Research Institute

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
6 changes: 3 additions & 3 deletions README.md
@@ -1,15 +1,15 @@
<a href="url"><img src="http://bioinformatics.victorchang.edu.au/projects/falco/images/falco_logo.png" align="left" height="100" alt="Falco"></a>
# _Falco_: A quick and flexible single-cell RNA-seq processing framework on the cloud

Authors: Andrian Yang, Michael Troup, Peijie Lin, Joshua W. K. Ho
Authors: Andrian Yang, Michael Troup, Peijie Lin, Abhinav Kishore, Benjamin Phipps, Joshua W. K. Ho

Contact: [email protected]

Copyright © 2016, Victor Chang Cardiac Research Institute
Copyright © 2016-2019, Victor Chang Cardiac Research Institute

## Synopsis
_Falco_ is a software bundle that enables bioinformatic analysis of large-scale transcriptomic data by utilising public cloud
infrastructure. The framework is suited to single cell RNA feature quantification analysis.
infrastructure. The framework currently provides support for single-cell RNA feature quantification, alignment and transcript assembly analyses.

## Motivation
Computational analysis in this field has many challenges, including processing large volumes of data in the order of
19 changes: 19 additions & 0 deletions alignment_job.config
@@ -0,0 +1,19 @@
[job_config]
name = FASTQ alignment
action_on_failure = CONTINUE
alignment_script = run_pipeline_alignment.py
alignment_script_s3_location = s3://[YOUR-BUCKET]/scripts
alignment_script_local_location = source/spark_runner
upload_alignment_script = True

[spark_config]
driver_memory = 30g
executor_memory = 30g

[script_arguments]
input_location = s3://[YOUR-BUCKET]/...
output_location = s3://[YOUR-BUCKET]/...
# Valid options for aligner_tool: STAR or HISAT2
aligner_tool = STAR
aligner_extra_args =
region = us-west-2
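
The job configs are plain INI files, and launch_cluster.py below indexes config["EMR"][...] in configparser style, so presumably the same module reads these. A minimal, hedged sketch of loading and inspecting a job config like the one above; the STAR/HISAT2 check is assumed from the comment in the file, not taken from Falco's own reader:

#!/usr/bin/env python3
# Sketch: load a Falco job config with configparser (Python 3).
# Section and key names mirror alignment_job.config above; the
# validation shown is an illustration, not Falco's actual code.
import configparser

config = configparser.ConfigParser()
config.read("alignment_job.config")

aligner = config["script_arguments"]["aligner_tool"]
if aligner not in ("STAR", "HISAT2"):
    raise SystemExit("aligner_tool must be STAR or HISAT2, got %r" % aligner)

print(config["job_config"]["name"], "->",
      config["script_arguments"]["output_location"])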
27 changes: 27 additions & 0 deletions assembly_job.config
@@ -0,0 +1,27 @@
[job_config]
name = Transcript Assembly
action_on_failure = CONTINUE
assembly_script = run_pipeline_assembly3.py
assembly_script_s3_location = s3://[YOUR-BUCKET]/scripts
assembly_script_local_location = source/spark_runner
upload_assembly_script = True

[spark_config]
driver_memory = 60g
executor_memory = 30g

[script_arguments]
input_location = s3://[YOUR-BUCKET]/...
output_location = s3://[YOUR-BUCKET]/...
annotation_file =
enable_tiling = True
enable_analysis = True
# Valid options for aligner_tool: STAR or HISAT2
aligner_tool = STAR
aligner_extra_args =
# Valid options for assembler_tool: StringTie or Scallop
assembler_tool = StringTie
assembler_use_reference =
assembler_extra_args =
assembler_merge_extra_args =
region = us-west-2
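
assembly_job.config adds boolean switches (enable_tiling, enable_analysis) and a second tool choice on top of the alignment options. A hedged sketch of how those typed values can be read and checked with configparser, again an assumption rather than Falco's actual reader:

# Sketch: typed reads from assembly_job.config (illustrative only).
# getboolean() accepts the True/False values written in the file above;
# the allowed tool names come from the comments in the config.
import configparser

config = configparser.ConfigParser()
config.read("assembly_job.config")
args = config["script_arguments"]

tiling = args.getboolean("enable_tiling")
analysis = args.getboolean("enable_analysis")

if args["assembler_tool"] not in ("StringTie", "Scallop"):
    raise SystemExit("assembler_tool must be StringTie or Scallop")

print("tiling=%s analysis=%s assembler=%s"
      % (tiling, analysis, args["assembler_tool"]))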
5 changes: 4 additions & 1 deletion emr_cluster.config
@@ -1,5 +1,5 @@
[EMR]
release_label = emr-4.6.0
release_label = emr-5.7.0
name = Falco cluster
log_uri = s3://[YOUR-BUCKET]/...
bootstrap_scripts = install_software.sh, copy_reference.sh
@@ -25,3 +25,6 @@ vpc_subnet =
master_security_group =
slave_security_group =
service_access_security_group =
# If a custom AMI ID is specified, it is recommended to remove install_software.sh from the bootstrap scripts, as the custom AMI should already have the required software installed.
custom_ami_id =
ebs_root_volume_size =
16 changes: 15 additions & 1 deletion launch_cluster.py
@@ -8,7 +8,7 @@
global emr_configuration, emr_applications, cluster_config, optional_instance_config
emr_configuration = "emr_cluster.config"
emr_applications = ["Hadoop", "Spark", "Ganglia"]
cluster_config = "" # ""source/cluster_creator/cluster_config.json"
cluster_config = "source/cluster_creator/cluster_config.json"
optional_instance_config = {"vpc_subnet": "Ec2SubnetId",
"master_security_group": "EmrManagedMasterSecurityGroup",
"slave_security_group": "EmrManagedSlaveSecurityGroup",
@@ -29,6 +29,14 @@ def check_configuration(config):
"core_instance_type", "core_instance_count"]):
return False

release_version = config["EMR"]["release_label"].split("-")[-1].split(".")
major_release_version = int(release_version[0])
minor_release_version = int(release_version[1])
# Compare (major, minor) as a tuple: a plain "major >= 5 and minor >= 7"
# check would wrongly reject releases such as emr-6.2.0.
if config["EMR_nodes"].get("custom_ami_id", "").strip() != "" \
and (major_release_version, minor_release_version) < (5, 7):
print("\033[31mERROR: \033[0mCustom AMI can only be used with EMR release >= 5.7")
return False

return True


@@ -119,6 +127,12 @@ def build_command(config):
emr_arguments["JobFlowRole"] = config["EMR_nodes"]["instance_profile"]
emr_arguments["ServiceRole"] = config["EMR_nodes"]["service_role"]

if "custom_ami_id" in config["EMR_nodes"]:
emr_arguments["CustomAmiId"] = config["EMR_nodes"]["custom_ami_id"]

if "ebs_root_volume_size" in config["EMR_nodes"]:
emr_arguments["EbsRootVolumeSize"] = config["EMR_nodes"]["ebs_root_volume_size"]

return emr_arguments

if __name__ == "__main__":
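
build_command() returns emr_arguments as a dict of RunJobFlow parameters; CustomAmiId and EbsRootVolumeSize are both genuine fields of that API (the latter an integer number of GiB), available from EMR release 5.7 onward. A hedged sketch of how such a dict is typically submitted with boto3; the exact call site in Falco may differ:

# Sketch: hand the assembled arguments to the EMR API via boto3.
# Assumes emr_arguments came from build_command() above and that AWS
# credentials and region are already configured for this session.
import boto3

emr_client = boto3.client("emr", region_name="us-west-2")
response = emr_client.run_job_flow(**emr_arguments)
print("Launched cluster:", response["JobFlowId"])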
6 changes: 6 additions & 0 deletions source/ami_creator/create-image.sh
@@ -0,0 +1,6 @@
#!/bin/bash
id=$( head -1 ids.txt )
# --output text keeps the file to the bare AMI ID (the default JSON
# output would wrap it in quotes)
aws ec2 create-image --instance-id $id \
--name "Falco custom AMI" \
--description "Custom AMI for Falco framework with tools pre-installed" \
--output text --query 'ImageId' > custom_ami_id.txt
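
create-image.sh returns as soon as the CreateImage call is accepted, but the AMI can take several minutes to reach the available state. A hedged follow-up sketch, not part of this commit, that blocks until the image is usable as custom_ami_id in emr_cluster.config:

# Sketch: wait for the AMI recorded by create-image.sh to become
# available. Assumes custom_ami_id.txt holds a bare AMI ID.
import boto3

with open("custom_ami_id.txt") as f:
    ami_id = f.read().strip()

ec2 = boto3.client("ec2")
ec2.get_waiter("image_available").wait(ImageIds=[ami_id])
print(ami_id, "is ready to use")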
12 changes: 12 additions & 0 deletions source/ami_creator/ec2-ami-spec.json
@@ -0,0 +1,12 @@
{
"ImageId": "ami-6df1e514",
"KeyName": "[PRIVATE_KEY]",
"InstanceType": "[INSTANCE_TYPE]",
"Placement": {
"AvailabilityZone": "[AVAILABILITY_ZONE]"
},
"IamInstanceProfile": {
"Arn": "arn:aws:iam::[AWS_ACCOUNT_ID]:instance-profile/EMR_EC2_DefaultRole"
},
"UserData":""
}
180 changes: 180 additions & 0 deletions source/ami_creator/launch-ec2.sh
@@ -0,0 +1,180 @@
#!/bin/bash

count=1
spot_price=5
launch_spec_json_file=ec2-ami-spec.json
user_data_file=user-data-ami.sh
tmp_json_file=tmp.json

#colours; need to use -e option with echo
#red='\e[0;31m'
#cyan='\e[0;36m'
#green='\e[0;32m'
#yellow='\e[1;33m'
#purple='\e[0;35m'
#nc='\e[0m' #no colour
red=$( tput setaf 1 )
cyan=$( tput setaf 6 )
green=$( tput setaf 2 )
yellow=$( tput setaf 3 )
purple=$( tput setaf 5 )
nc=$( tput sgr0 )

#program usage
usage() {
echo -e "${red}program exited due to invalid usage${nc}"
echo -e "${yellow}usage:${purple} $0 ${cyan}--instance-type <type> \
<count> [[--instance-type <type> <count>]...] \
[--user-data <value>] [--dry-run]${nc}"
echo -e "${yellow}Example:${nc}"
echo -e "$0 --instance-type r3.large 1 --user-data some-file.sh --dry-run"
echo -e "${yellow}Valid instance types:${cyan}"
echo -e "${nc}"
exit 1
}

#checks latest return status
#accepts one argument - name of program call
check_status() {
if [ $? -ne 0 ] ; then
echo -e "${red}program exited due to unsuccessful excecution: \
${cyan}${1}${nc}"
exit 1
fi
}

#checks program usage: assumes 1 parameter: modify as necessary
#if [ $# -lt 3 ] ; then
# usage
#fi

#function to exit program with message
exit_msg() {
echo -e "${red}Exiting program: ${cyan}${1}${nc}"
exit 1
}

start=`date +%s` #seconds since epoch

# remove old instance id file
ids_file=ids.txt
if [[ -f $ids_file ]] ; then
rm $ids_file
fi

# file to hold instance public ip addresses
ips_file=ips.txt
if [[ -f $ips_file ]] ; then
rm $ips_file
fi

# file to hold spot instance request ids
spot_request_ids_file=spot-ids.txt
if [[ -f $spot_request_ids_file ]] ; then
rm $spot_request_ids_file
fi

# create base64 code for user data
# NOTE base64 differs between macOS and Linux:
# macOS base64 uses the -b option instead of the -w option
op="-w"
v=$( man base64 | grep '\-w' )
[ $? -ne 0 ] && op="-b"
user_data=$( base64 $op 0 $user_data_file )
check_status "creating user data"
# update the base64 user-data string in the .json file
awk -F":" -v user_data=$user_data -v json_file=$tmp_json_file '{
if ($1 ~ /UserData/)
print $1 ":\"" user_data "\"" > json_file
else
print $0 > json_file
} END {
close (json_file)
}' $launch_spec_json_file
check_status "replacing user data"
mv $tmp_json_file $launch_spec_json_file

#create the instances that were requested
aws ec2 request-spot-instances \
--spot-price $spot_price \
--instance-count $count \
--type "one-time" \
--launch-specification file://$launch_spec_json_file \
--output text \
--query 'SpotInstanceRequests[*].SpotInstanceRequestId' > \
$spot_request_ids_file

if [[ $? -ne 0 ]] ; then
exit_msg "spot request command failed"
fi
echo -e "${yellow}Waiting for spot requests to be fulfilled${nc}"
while (true) ; do
sleep 10
fulfilled=0
for request_id in `cat $spot_request_ids_file` ; do
# the effect of this loop is to count the number of fulfilled
# spot requests
fulfilled=$((`aws ec2 describe-spot-instance-requests \
--spot-instance-request-ids $request_id \
--output text \
--query "SpotInstanceRequests[*].Status.Code" \
|grep "fulfilled"|wc -l|awk '{print $1}'` + \
$fulfilled))
done
if [[ $fulfilled -eq $count ]] ; then
# record instance_ids
for request_id in `cat $spot_request_ids_file` ; do
aws ec2 describe-spot-instance-requests \
--spot-instance-request-ids $request_id \
--output text \
--query "SpotInstanceRequests[*].InstanceId" >> \
$ids_file
done
break
fi
done
echo -e "${green}All spot requests have been fulfilled${nc}"
all_done=true

#wait a minute for the instances to start running
sleep 60
#start an infinite loop to check when instances are running
while true; do
all_done=true
#check the run state of each instance id that was created
if [[ -f $ips_file ]] ; then
rm -f $ips_file
fi
for id in `cat $ids_file`; do
#check the instance reachability status - when "passed"
#should be ok to use
instance_details_name=`aws ec2 describe-instance-status \
--output text \
--instance-ids $id --query \
'InstanceStatuses[0].InstanceStatus.Details[0].Name'`
instance_details_status=`aws ec2 describe-instance-status \
--output text \
--instance-ids $id --query \
'InstanceStatuses[0].InstanceStatus.Details[0].Status'`
if ! [[ ("$instance_details_name" == "reachability") &&
("$instance_details_status" == "passed") ]] ; then
all_done=false
#this instance is not ready
break
fi
ipaddr=`aws ec2 describe-instances --instance-ids $id --output text --query \
'Reservations[0].Instances[0].PublicDnsName'`
inst_type=`aws ec2 describe-instances --instance-ids $id --output text --query \
'Reservations[0].Instances[0].InstanceType'`
echo $ipaddr >> $ips_file
done
if ! $all_done ; then
sleep 10
else
break
fi
done

finish=`date +%s` #seconds since epoch
echo -e "${yellow}time: ${cyan}$(( $finish - $start ))${nc}"
cat $ips_file
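
For orientation, the spot-request half of launch-ec2.sh maps onto two boto3 calls plus built-in waiters. A hedged, compressed sketch against the same ec2-ami-spec.json; the user-data substitution and error handling are omitted, so treat this as a reading aid rather than a drop-in replacement:

# Sketch: condensed boto3 equivalent of the request/poll logic above.
import json
import boto3

ec2 = boto3.client("ec2")
with open("ec2-ami-spec.json") as f:
    launch_spec = json.load(f)

resp = ec2.request_spot_instances(
    SpotPrice="5", InstanceCount=1, Type="one-time",
    LaunchSpecification=launch_spec)
request_ids = [r["SpotInstanceRequestId"]
               for r in resp["SpotInstanceRequests"]]

# Built-in waiters replace the manual sleep/poll loops in the script.
ec2.get_waiter("spot_instance_request_fulfilled").wait(
    SpotInstanceRequestIds=request_ids)
desc = ec2.describe_spot_instance_requests(
    SpotInstanceRequestIds=request_ids)
instance_ids = [r["InstanceId"] for r in desc["SpotInstanceRequests"]]
ec2.get_waiter("instance_status_ok").wait(InstanceIds=instance_ids)

for reservation in ec2.describe_instances(
        InstanceIds=instance_ids)["Reservations"]:
    for inst in reservation["Instances"]:
        print(inst["PublicDnsName"])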
3 changes: 3 additions & 0 deletions source/ami_creator/ssh.sh
@@ -0,0 +1,3 @@
#!/bin/bash
ip=$( head -1 ips.txt )
ssh -i [PRIVATE_KEY] ec2-user@$ip