diff --git a/LICENSE b/LICENSE
index 3550843..6151443 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,5 @@
Falco - Cloud based single-cell RNA-Seq pipeline
-Copyright (c) 2016, Victor Chang Cardiac Research Institute
+Copyright (c) 2016-2019, Victor Chang Cardiac Research Institute
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/README.md b/README.md
index edd05e1..6baf7bb 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,15 @@
# _Falco_: A quick and flexible single-cell RNA-seq processing framework on the cloud
-Authors: Andrian Yang, Michael Troup, Peijie Lin, Joshua W. K. Ho
+Authors: Andrian Yang, Michael Troup, Peijie Lin, Abhinav Kishore, Benjamin Phipps, Joshua W. K. Ho
Contact: j.ho@victorchang.edu.au
-Copyright © 2016, Victor Chang Cardiac Research Institute
+Copyright © 2016-2019, Victor Chang Cardiac Research Institute
## Synopsis
_Falco_ is a software bundle that enables bioinformatic analysis of large-scale transcriptomic data by utilising public cloud
-infrastructure. The framework is suited to single cell RNA feature quantification analysis.
+infrastructure. The framework currently provides support for single-cell RNA feature quantification, alignment and transcript assembly analyses.
## Motivation
Computational analysis in this field has many challenges, including processing large volumes of data in the order of
diff --git a/alignment_job.config b/alignment_job.config
new file mode 100644
index 0000000..a15b1b3
--- /dev/null
+++ b/alignment_job.config
@@ -0,0 +1,19 @@
+[job_config]
+name = FASTQ alignment
+action_on_failure = CONTINUE
+alignment_script = run_pipeline_alignment.py
+alignment_script_s3_location = s3://[YOUR-BUCKET]/scripts
+alignment_script_local_location = source/spark_runner
+upload_alignment_script = True
+
+[spark_config]
+driver_memory = 30g
+executor_memory = 30g
+
+[script_arguments]
+input_location = s3://[YOUR-BUCKET]/...
+output_location = s3://[YOUR-BUCKET]/...
+# Options for the aligner tool are STAR or HISAT2
+aligner_tool = STAR
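+# aligner_extra_args is passed through to the aligner command line; leave blank for defaults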
+aligner_extra_args =
+region = us-west-2
\ No newline at end of file
diff --git a/assembly_job.config b/assembly_job.config
new file mode 100644
index 0000000..00e4ebe
--- /dev/null
+++ b/assembly_job.config
@@ -0,0 +1,27 @@
+[job_config]
+name = Transcript Assembly
+action_on_failure = CONTINUE
+assembly_script = run_pipeline_assembly3.py
+assembly_script_s3_location = s3://[YOUR-BUCKET]/scripts
+assembly_script_local_location = source/spark_runner
+upload_assembly_script = True
+
+[spark_config]
+driver_memory = 60g
+executor_memory = 30g
+
+[script_arguments]
+input_location = s3://[YOUR-BUCKET]/...
+output_location = s3://[YOUR-BUCKET]/...
+annotation_file =
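+# enable_tiling makes neighbouring genomic bins overlap so reads spanning a bin boundary are assembled in both bins;
+# enable_analysis presumably toggles the gffcompare comparison of the assembly against the reference annotation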
+enable_tiling = True
+enable_analysis = True
+# Options for the aligner tool are STAR or HISAT2
+aligner_tool = STAR
+aligner_extra_args =
+# Options for the assembler tool are StringTie or Scallop
+assembler_tool = StringTie
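+# assembler_use_reference applies to StringTie only: the annotation file is passed via -G for genome-guided assembly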
+assembler_use_reference =
+assembler_extra_args =
+assembler_merge_extra_args =
+region = us-west-2
diff --git a/emr_cluster.config b/emr_cluster.config
index 21c5f88..2180423 100644
--- a/emr_cluster.config
+++ b/emr_cluster.config
@@ -1,5 +1,5 @@
[EMR]
-release_label = emr-4.6.0
+release_label = emr-5.7.0
name = Falco cluster
log_uri = s3://[YOUR-BUCKET]/...
bootstrap_scripts = install_software.sh, copy_reference.sh
@@ -25,3 +25,6 @@ vpc_subnet =
master_security_group =
slave_security_group =
service_access_security_group =
+# If a custom AMI ID is specified, it is recommended to remove install_software.sh from the bootstrap_scripts entry, as the custom AMI should already have the software installed.
+custom_ami_id =
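+# ebs_root_volume_size is specified in GiB; leave blank to use the EMR default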
+ebs_root_volume_size =
\ No newline at end of file
diff --git a/launch_cluster.py b/launch_cluster.py
index 85b31cf..fe3ea5d 100644
--- a/launch_cluster.py
+++ b/launch_cluster.py
@@ -8,7 +8,7 @@
global emr_configuration, emr_applications, cluster_config, optional_instance_config
emr_configuration = "emr_cluster.config"
emr_applications = ["Hadoop", "Spark", "Ganglia"]
-cluster_config = "" # ""source/cluster_creator/cluster_config.json"
+cluster_config = "source/cluster_creator/cluster_config.json"
optional_instance_config = {"vpc_subnet": "Ec2SubnetId",
"master_security_group": "EmrManagedMasterSecurityGroup",
"slave_security_group": "EmrManagedSlaveSecurityGroup",
@@ -29,6 +29,14 @@ def check_configuration(config):
"core_instance_type", "core_instance_count"]):
return False
+ release_version = config["EMR"]["release_label"].split("-")[-1].split(".")
+ major_release_version = int(release_version[0])
+ minor_release_version = int(release_version[1])
+    if config["EMR_nodes"].get("custom_ami_id", "").strip() != "" \
+            and (major_release_version, minor_release_version) < (5, 7):
+ print("\033[31mERROR: \033[0mCustom AMI can only be used with EMR release >= 5.7")
+ return False
+
return True
@@ -119,6 +127,12 @@ def build_command(config):
emr_arguments["JobFlowRole"] = config["EMR_nodes"]["instance_profile"]
emr_arguments["ServiceRole"] = config["EMR_nodes"]["service_role"]
+    if config["EMR_nodes"].get("custom_ami_id", "").strip() != "":
+        emr_arguments["CustomAmiId"] = config["EMR_nodes"]["custom_ami_id"].strip()
+
+    if config["EMR_nodes"].get("ebs_root_volume_size", "").strip() != "":
+        emr_arguments["EbsRootVolumeSize"] = int(config["EMR_nodes"]["ebs_root_volume_size"])
+
return emr_arguments
if __name__ == "__main__":
diff --git a/source/ami_creator/create-image.sh b/source/ami_creator/create-image.sh
new file mode 100755
index 0000000..36c3137
--- /dev/null
+++ b/source/ami_creator/create-image.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
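+# Run after launch-ec2.sh: reads the instance id recorded in ids.txt and writes the new
+# AMI id to custom_ami_id.txt (use it as custom_ami_id in emr_cluster.config).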
+id=$( head -1 ids.txt )
+aws ec2 create-image --instance-id $id \
+ --name "Falco custom AMI" \
+ --description "Custom AMI for Falco framework with tools pre-installed" \
+    --query 'ImageId' --output text > custom_ami_id.txt
diff --git a/source/ami_creator/ec2-ami-spec.json b/source/ami_creator/ec2-ami-spec.json
new file mode 100644
index 0000000..994c1c7
--- /dev/null
+++ b/source/ami_creator/ec2-ami-spec.json
@@ -0,0 +1,12 @@
+{
+ "ImageId": "ami-6df1e514",
+ "KeyName": "[PRIVATE_KEY]",
+ "InstanceType": "[INSTANCE_TYPE]",
+ "Placement": {
+ "AvailabilityZone": "[AVAILABILITY_ZONE]"
+ },
+ "IamInstanceProfile": {
+ "Arn": "arn:aws:iam::[AWS_ACCOUNT_ID]:instance-profile/EMR_EC2_DefaultRole"
+ },
+ "UserData":""
+}
diff --git a/source/ami_creator/launch-ec2.sh b/source/ami_creator/launch-ec2.sh
new file mode 100755
index 0000000..86543da
--- /dev/null
+++ b/source/ami_creator/launch-ec2.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
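+# Requests a spot EC2 instance described by ec2-ami-spec.json, injecting user-data-ami.sh
+# as base64-encoded user data, then waits until the instance passes its reachability check
+# and records the instance id (ids.txt) and public DNS name (ips.txt).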
+count=1
+spot_price=5
+launch_spec_json_file=ec2-ami-spec.json
+user_data_file=user-data-ami.sh
+tmp_json_file=tmp.json
+
+#colours; need to use -e option with echo
+#red='\e[0;31m'
+#cyan='\e[0;36m'
+#green='\e[0;32m'
+#yellow='\e[1;33m'
+#purple='\e[0;35m'
+#nc='\e[0m' #no colour
+red=$( tput setaf 1 )
+cyan=$( tput setaf 6 )
+green=$( tput setaf 2 )
+yellow=$( tput setaf 3 )
+purple=$( tput setaf 5 )
+nc=$( tput sgr0 )
+
+#program usage
+usage() {
+ echo -e "${red}program exited due to invalid usage${nc}"
+    echo -e "${yellow}usage:${purple} $0 ${cyan}--instance-type <type> <count> \
+    [[--instance-type <type> <count>]...] \
+    [--user-data <file>] [--dry-run]${nc}"
+ echo -e "${yellow}Example:${nc}"
+ echo -e "$0 --instance-type r3.large 1 --user-data some-file.sh --dry-run"
+ echo -e "${yellow}Valid instance types:${cyan}"
+ echo -e "${nc}"
+ exit 1
+}
+
+#checks latest return status
+#accepts one argument - name of program call
+check_status() {
+ if [ $? -ne 0 ] ; then
+        echo -e "${red}program exited due to unsuccessful execution: \
+ ${cyan}${1}${nc}"
+ exit 1
+ fi
+}
+
+#checks program usage: assumes 1 parameter: modify as necessary
+#if [ $# -lt 3 ] ; then
+# usage
+#fi
+
+#function to exit program with message
+exit_msg() {
+ echo -e "${red}Exiting program: ${cyan}${1}${nc}"
+ exit 1
+}
+
+start=`date +%s` #seconds since epoc
+
+# remove old instance id file
+ids_file=ids.txt
+if [[ -f $ids_file ]] ; then
+ rm $ids_file
+fi
+
+# file to hold instance public ip addresses
+ips_file=ips.txt
+if [[ -f $ips_file ]] ; then
+ rm $ips_file
+fi
+
+# file to hold spot instance request ids
+spot_request_ids_file=spot-ids.txt
+if [[ -f $spot_request_ids_file ]] ; then
+ rm $spot_request_ids_file
+fi
+
+# create base64 code for user data
+# NOTE difference in base64 between mac platform & other linux
+# mac base64 uses the -b option instead of the -w option
+op="-w"
+v=$( man base64 | grep '\-w' )
+[ $? -ne 0 ] && op="-b"
+user_data=$( base64 $op 0 $user_data_file )
+check_status "creating user data"
+# update the base64 user-data string in the .json file
+awk -F":" -v user_data=$user_data -v json_file=$tmp_json_file '{
+ if ($1 ~ /UserData/)
+ print $1 ":\"" user_data "\"" > json_file
+ else
+ print $0 > json_file
+} END {
+close (json_file)
+}' $launch_spec_json_file
+check_status "replacing user data"
+mv $tmp_json_file $launch_spec_json_file
+
+#create the instances that were requested
+aws ec2 request-spot-instances \
+ --spot-price $spot_price \
+ --instance-count $count \
+ --type "one-time" \
+ --launch-specification file://$launch_spec_json_file \
+ --output text \
+ --query 'SpotInstanceRequests[*].SpotInstanceRequestId' > \
+ $spot_request_ids_file
+
+if [[ $? -ne 0 ]] ; then
+ exit_msg "spot request command failed"
+fi
+echo -e "${yellow}Waiting for spot requests to be fulfilled${nc}"
+while (true) ; do
+ sleep 10
+ fulfilled=0
+ for request_id in `cat $spot_request_ids_file` ; do
+        # the effect of this loop is to count the # of fulfilled
+ # spot requests
+ fulfilled=$((`aws ec2 describe-spot-instance-requests \
+ --spot-instance-request-ids $request_id \
+ --output text \
+ --query "SpotInstanceRequests[*].Status.Code" \
+ |grep "fulfilled"|wc -l|awk '{print $1}'` + \
+ $fulfilled))
+ done
+ if [[ $fulfilled -eq $count ]] ; then
+ # record instance_ids
+ for request_id in `cat $spot_request_ids_file` ; do
+ aws ec2 describe-spot-instance-requests \
+ --spot-instance-request-ids $request_id \
+ --output text \
+ --query "SpotInstanceRequests[*].InstanceId" >> \
+ $ids_file
+ done
+ break
+ fi
+done
+echo -e "${green}All spot requests have been fulfilled${nc}"
+all_done=true
+
+#wait a minute for the instances to start running
+sleep 60
+#start an infinite loop to check when instances are running
+while true; do
+ all_done=true
+ #check the run state of each instance id that was created
+ if [[ -f $ips_file ]] ; then
+ rm -f $ips_file
+ fi
+ for id in `cat $ids_file`; do
+ #check the instance reachability status - when "passed"
+ #should be ok to use
+ instance_details_name=`aws ec2 describe-instance-status \
+ --output text \
+ --instance-ids $id --query \
+ 'InstanceStatuses[0].InstanceStatus.Details[0].Name'`
+ instance_details_status=`aws ec2 describe-instance-status \
+ --output text \
+ --instance-ids $id --query \
+ 'InstanceStatuses[0].InstanceStatus.Details[0].Status'`
+ if ! [[ ("$instance_details_name" == "reachability") &&
+ ("$instance_details_status" == "passed") ]] ; then
+ all_done=false
+ #this instance is not ready
+ break
+ fi
+        ipaddr=`aws ec2 describe-instances --instance-ids $id --output text --query \
+            'Reservations[0].Instances[0].PublicDnsName'`
+        inst_type=`aws ec2 describe-instances --instance-ids $id --output text --query \
+            'Reservations[0].Instances[0].InstanceType'`
+ echo $ipaddr >> $ips_file
+ done
+ if ! $all_done ; then
+ sleep 10
+ else
+ break
+ fi
+done
+
+finish=`date +%s` #seconds since epoc
+echo -e "${yellow}time: ${cyan}$(( $finish - $start ))${nc}"
+cat $ips_file
diff --git a/source/ami_creator/ssh.sh b/source/ami_creator/ssh.sh
new file mode 100755
index 0000000..50e2898
--- /dev/null
+++ b/source/ami_creator/ssh.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+ip=$( head -1 ips.txt )
+ssh -i [PRIVATE_KEY] ec2-user@$ip
diff --git a/source/ami_creator/user-data-ami.sh b/source/ami_creator/user-data-ami.sh
new file mode 100755
index 0000000..933e088
--- /dev/null
+++ b/source/ami_creator/user-data-ami.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+###############################################################################
+# user-data script for an EC2 instance (passed via the --user-data option);  #
+# installs the software required by Falco so the instance can be imaged as a #
+# custom AMI (see create-image.sh)                                           #
+# #
+###############################################################################
+
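+# s3_software_install should point to the bucket/prefix holding the tool archives staged by
+# source/cluster_creator/prepare_install_files.sh (STAR, HISAT2, subread, samtools, etc.)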
+s3_software_install=s3://[YOUR-BUCKET]/...
+aws_region=us-west-2
+
+create_dir() {
+ dir=$1
+ # allow for case where image already has dir
+ sudo mkdir -p $dir
+ check_status "mkdir -p $dir"
+ sudo chmod a+rwx $dir
+ check_status "chmod $dir"
+ sudo chgrp ec2-user $dir
+ check_status "chgrp $dir"
+ sudo chown ec2-user $dir
+ check_status "chown $dir"
+}
+
+function unzip_files() {
+ # unzip any .gz files in current directory or any subdirectories
+ # determine if there are any .gz files; note that without this test, the xargs command would fail with a null input
+ zip_files=$( find -L . -name "*.gz" -print0 )
+ if [ "$zip_files" != "" ] ; then
+ # unzip all the .gz files using as many processors as possible
+ find -L . -name "*.gz" -print0 | xargs -P0 -0 gunzip
+ fi
+}
+
+#copy data to newly mounted drive
+mount_dir=/
+create_dir $mount_dir/app
+
+set -e
+set -o pipefail
+
+# update system software
+sudo yum update -y --skip-broken
+
+pushd /app > /dev/null
+
+# copy install software
+aws s3 cp $s3_software_install . --recursive --region=$aws_region
+
+# give rights to ec2-user
+for f in * ; do
+ sudo chgrp ec2-user $f
+ sudo chown ec2-user $f
+done
+
+# Install STAR and its dependencies
+sudo yum install make gcc-c++ glibc-static -y
+
+# STAR
+tar -xzf STAR*.tar.gz
+star_path=$( find . -name "STAR"|grep -E "/Linux_x86_64/" )
+# symbolic link to the STAR directory (rather than to the executable itself)
+ln -s ${star_path%STAR} STAR
+
+sudo yum install python-devel numpy python-matplotlib -y
+
+# Install subread (featureCount)
+tar -xzf subread*.tar.gz
+fc=$( find -name "featureCounts"|grep bin )
+sr_path=${fc%featureCounts}
+ln -s $sr_path subread
+
+# Install HISAT2
+unzip hisat2*.zip
+hisat_dir=$( find . -maxdepth 1 -type d -name "hisat2*")
+ln -s $hisat_dir hisat
+
+# Install HTSeq
+sudo yum install python27-devel python27-numpy python27-matplotlib python27-Cython -y
+sudo pip install pysam
+sudo pip install htseq
+
+# Install samtools
+sudo yum install zlib-devel ncurses-devel ncurses bzip2-devel xz-devel -y
+tar -xjf samtools*.tar.bz2
+sam_dir=$( find . -maxdepth 1 -type d -name "samtools*" )
+pushd $sam_dir > /dev/null
+make
+sudo make install
+popd > /dev/null
+ln -s $sam_dir samtools
+
+# Install htslib
+hts_dir=$( find $sam_dir -maxdepth 1 -type d -name "htslib-*" )
+pushd $hts_dir > /dev/null
+make
+sudo make install
+popd > /dev/null
+
+# Install picard_tools
+# Note: the latest version of picard tools comes as a single jar file, so no extraction is needed.
+mkdir picard-tool
+mv picard.jar picard-tool/
+
+# Install stringtie
+tar -xzf stringtie*.tar.gz
+stringtie_dir=$( find . -maxdepth 1 -type d -name "stringtie*")
+ln -s $stringtie_dir stringtie
+
+# Install scallop
+tar -xzf scallop-*_linux_x86_64.tar.gz
+scallop_dir=$( find . -maxdepth 1 -type d -name "scallop*" )
+ln -s $scallop_dir scallop
+
+# Install gffcompare
+tar -xzf gffcompare*.tar.gz
+gffcompare_dir=$( find . -maxdepth 1 -type d -name "gffcompare*")
+ln -s $gffcompare_dir gffcompare
+
+# INSTALL CUSTOM PRE-PROCESSING TOOLS BELOW
+
+# trim galore
+tg=trim_galore
+unzip trim_galore*.zip
+tg_path=$( find . -name $tg )
+ln -s $tg_path $tg
+
+# trimmomatic
+unzip Trimmomatic*.zip
+tm=$( find . -name trimmomatic*.jar )
+ln -s $tm ${tm##*/}
+# hardcoded
+ln -s Trimmomatic-0.36/adapters/NexteraPE-PE.fa NexteraPE-PE.fa
+
+# prinseq
+ps=prinseq-lite.pl
+tar -xzf prinseq-lite*.tar.gz
+ps_path=$( find . -name "$ps" )
+ln -s $ps_path $ps
+
+# install cutadapt
+sudo pip install cutadapt
+
+# -------------------------------------------------------------
+# no longer in /app
+popd > /dev/null
+
+mkdir /mnt/output
+
+# Install python dependencies for framework
+sudo yum install python35 -y
+sudo pip install pandas boto3 ascii_graph pysam
+sudo python3 -m pip install pandas boto3 ascii_graph pysam
+
+# Install java8
+sudo yum install java-1.8.0-openjdk.x86_64 -y
+
+# install htop
+sudo yum install htop -y
+
+
diff --git a/source/cluster_creator/cluster_config.json b/source/cluster_creator/cluster_config.json
new file mode 100644
index 0000000..ad766fe
--- /dev/null
+++ b/source/cluster_creator/cluster_config.json
@@ -0,0 +1,13 @@
+[
+ {
+ "Classification": "spark-env",
+ "Configurations": [
+ {
+ "Classification": "export",
+ "Properties": {
+ "PYSPARK_PYTHON": "/usr/bin/python3"
+ }
+ }
+ ]
+ }
+]
\ No newline at end of file
diff --git a/source/cluster_creator/copy_reference.sh b/source/cluster_creator/copy_reference.sh
index 38a3388..160db1d 100755
--- a/source/cluster_creator/copy_reference.sh
+++ b/source/cluster_creator/copy_reference.sh
@@ -21,3 +21,4 @@ if [ "$zip_files" != "" ] ; then
fi
popd
+mkdir /mnt/output
diff --git a/source/cluster_creator/install_software.sh b/source/cluster_creator/install_software.sh
index 4a54375..a15c754 100644
--- a/source/cluster_creator/install_software.sh
+++ b/source/cluster_creator/install_software.sh
@@ -6,17 +6,16 @@
set -e
set -o pipefail
-sudo yum update -y
+#sudo yum update -y --skip-broken
-mkdir /mnt/app
-pushd /mnt/app > /dev/null
+sudo mkdir /app
+sudo chown hadoop /app # creating a directory in / requires sudo, so without this chown the owner would not be hadoop
+pushd /app > /dev/null
aws s3 cp $1 . --recursive
# Install STAR and its' dependencies
-sudo yum install make -y
-sudo yum install gcc-c++ -y
-sudo yum install glibc-static -y
+sudo yum install make gcc-c++ glibc-static -y
# STAR
tar -xzf STAR*.tar.gz
@@ -24,8 +23,6 @@ star_path=$( find . -name "STAR"|grep -E "/Linux_x86_64/" )
# symbolic link to the STAR directory (rather than to the executable itself)
ln -s ${star_path%STAR} STAR
-sudo yum install python-devel numpy python-matplotlib -y
-
# Install subread (featureCount)
tar -xzf subread*.tar.gz
fc=$( find -name "featureCounts"|grep bin )
@@ -38,11 +35,11 @@ hisat_dir=$( find . -maxdepth 1 -type d -name "hisat2*")
ln -s $hisat_dir hisat
# Install HTSeq
-sudo yum install python27-devel python27-numpy python27-matplotlib -y
sudo pip install pysam
sudo pip install htseq
# Install samtools
+sudo yum install zlib-devel ncurses-devel ncurses bzip2-devel xz-devel -y --skip-broken
tar -xjf samtools*.tar.bz2
sam_dir=$( find . -maxdepth 1 -type d -name "samtools*" )
pushd $sam_dir > /dev/null
@@ -56,14 +53,29 @@ hts_dir=$( find $sam_dir -maxdepth 1 -type d -name "htslib-*" )
pushd $hts_dir > /dev/null
make
sudo make install
-
popd > /dev/null
# Install picard_tools
-unzip picard-tools*.zip
-pic_jar=$( find . -name picard.jar )
-pic_path=${pic_jar%picard.jar}
-ln -s $pic_path picard-tools
+# Note: the latest version of picard tools comes as a single jar file, so no extraction is needed.
+mkdir picard-tool
+mv picard.jar picard-tool/
+
+# Install stringtie
+tar -xzf stringtie*.tar.gz
+stringtie_dir=$( find . -maxdepth 1 -type d -name "stringtie*")
+ln -s $stringtie_dir stringtie
+
+# Install scallop
+tar -xzf scallop-*_linux_x86_64.tar.gz
+scallop_dir=$( find . -maxdepth 1 -type d -name "scallop*" )
+ln -s $scallop_dir scallop
+
+# Install gffcompare
+tar -xzf gffcompare*.tar.gz
+gffcompare_dir=$( find . -maxdepth 1 -type d -name "gffcompare*")
+ln -s $gffcompare_dir gffcompare
+
+# INSTALL CUSTOM PRE-PROCESSING TOOLS BELOW
# trim galore
tg=trim_galore
@@ -82,23 +94,16 @@ tar -xzf prinseq-lite*.tar.gz
ps_path=$( find . -name "$ps" )
ln -s $ps_path $ps
+# install cutadapt
+sudo pip install cutadapt
+
# -------------------------------------------------------------
# no longer in /mnt/app
popd > /dev/null
-mkdir /mnt/output
-
# Install python dependencies for framework
-sudo yum install python27-Cython -y
-sudo pip install pandas
-sudo pip install boto3
-sudo python3 -m pip install boto3
-
-# Install java8
-sudo yum install java-1.8.0-openjdk.x86_64 -y
-
-# install cutadapt
-sudo pip install cutadapt
+sudo pip install pandas boto3 ascii_graph pysam
+sudo python3 -m pip install pandas boto3 ascii_graph pysam
# install htop
sudo yum install htop -y
diff --git a/source/cluster_creator/prepare_install_files.sh b/source/cluster_creator/prepare_install_files.sh
index 99d99de..b68cdc8 100755
--- a/source/cluster_creator/prepare_install_files.sh
+++ b/source/cluster_creator/prepare_install_files.sh
@@ -20,25 +20,34 @@ mkdir $tmp
cd $tmp
# STAR
-wget -O STAR-2.5.2a.tar.gz https://github.com/alexdobin/STAR/archive/2.5.2a.tar.gz
+wget -O STAR-2.5.4b.tar.gz https://github.com/alexdobin/STAR/archive/2.5.4b.tar.gz
# HISAT2
-wget -O hisat2-2.0.4.zip ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.0.4-Linux_x86_64.zip
+wget -O hisat2-2.1.0.zip ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip
-# subread
-wget -O subread-1.5.0-p3-Linux-x86_64.tar.gz https://sourceforge.net/projects/subread/files/subread-1.5.0-p3/subread-1.5.0-p3-Linux-x86_64.tar.gz/download
+# subread
+wget -O subread-1.6.0-Linux-x86_64.tar.gz https://sourceforge.net/projects/subread/files/subread-1.6.0/subread-1.6.0-Linux-x86_64.tar.gz/download
+
+# stringtie
+wget -O stringtie-1.3.3b.tar.gz http://ccb.jhu.edu/software/stringtie/dl/stringtie-1.3.3b.Linux_x86_64.tar.gz
+
+# Scallop
+wget https://github.com/Kingsford-Group/scallop/releases/download/v0.10.2/scallop-0.10.2_linux_x86_64.tar.gz
+
+# gffcompare
+wget -O gffcompare.tar.gz http://ccb.jhu.edu/software/stringtie/dl/gffcompare-0.10.1.Linux_x86_64.tar.gz
# picard tools
-wget https://github.com/broadinstitute/picard/releases/download/2.4.1/picard-tools-2.4.1.zip
+wget -O picard.jar https://github.com/broadinstitute/picard/releases/download/2.17.10/picard.jar
# sam tools
-wget -O samtools-1.3.1.tar.bz2 https://sourceforge.net/projects/samtools/files/samtools/1.3.1/samtools-1.3.1.tar.bz2/download
+wget -O samtools-1.7.tar.bz2 https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2
# prinseq
wget -O prinseq-lite-0.20.4.tar.gz https://sourceforge.net/projects/prinseq/files/standalone/prinseq-lite-0.20.4.tar.gz/download
# trim galore
-wget http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/trim_galore_v0.4.1.zip
+wget -O trim_galore_v0.4.5.zip https://github.com/FelixKrueger/TrimGalore/archive/0.4.5.zip
# trimmomatic
wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip
diff --git a/source/preprocessing/user_prinseq_cutadapt.sh b/source/preprocessing/user_prinseq_cutadapt.sh
index dc76622..d8bad45 100755
--- a/source/preprocessing/user_prinseq_cutadapt.sh
+++ b/source/preprocessing/user_prinseq_cutadapt.sh
@@ -9,8 +9,8 @@
# this file includes example code that could be used for paired-end data
-prinseq=/mnt/app/prinseq-lite.pl
-trim_galore=/mnt/app/trim_galore
+prinseq=/app/prinseq-lite.pl
+trim_galore=/app/trim_galore
# function to deal with terminal errors
exit_msg() {
diff --git a/source/preprocessing/user_trimmomatic.sh b/source/preprocessing/user_trimmomatic.sh
index fc21250..a295294 100755
--- a/source/preprocessing/user_trimmomatic.sh
+++ b/source/preprocessing/user_trimmomatic.sh
@@ -24,7 +24,7 @@ fq_2="$2"
[ -f $fq_1 ] || exit_msg "Unable to locate read 1 file: $fq_1"
[ -f $fq_2 ] || exit_msg "Unable to locate read 1 file: $fq_2"
-trimmomatic="/mnt/app/trimmomatic-0.36.jar"
+trimmomatic="/app/trimmomatic-0.36.jar"
fa="NexteraPE-PE.fa"
[ -f $trimmomatic ] || exit_msg "Couldn't find trimmomatic: $trimmomatic"
[ -f $fa ] || exit_msg "Couldn't find fa file: $fa"
diff --git a/source/spark_runner/run_pipeline_alignment.py b/source/spark_runner/run_pipeline_alignment.py
new file mode 100644
index 0000000..c362e58
--- /dev/null
+++ b/source/spark_runner/run_pipeline_alignment.py
@@ -0,0 +1,358 @@
+import argparse
+import sys
+from operator import add
+import os
+import shlex
+import shutil
+from subprocess import Popen, PIPE
+from pyspark import SparkContext, SparkConf
+import pyspark.serializers
+import subprocess
+import boto3
+import re
+
+global parser_result
+
+if sys.version_info >= (3, 4):
+ pyspark.serializers.protocol = 4
+
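+# /app is populated by install_software.sh (or baked into the custom AMI); /mnt/ref is
+# expected to hold the aligner indices copied by copy_reference.sh; per-chunk BAMs are
+# staged in HDFS under /tmp/sam_chunks before being merged per sample.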
+APPLICATION_FOLDER = "/app"
+GENOME_REFERENCES_FOLDER = "/mnt/ref"
+TEMP_OUTPUT_FOLDER = "/mnt/output"
+HDFS_TEMP_OUTPUT_FOLDER = "/tmp/sam_chunks"
+
+
+#################################
+# File splitting
+#################################
+
+
+def split_interleaved_file(file_prefix, file_content, output_dir):
+ """
+ Unpacks an interleaved file into the standard FASTQ format
+ :param file_prefix: the prefix of the file name
+ :param file_content: the lines of content from the input file
+ :param output_dir: the location to store the unpacked files
+ :return: a tuple with first element being a list of output file names
+ (1 for se, 2 for pe); 2nd element a boolean flag - True if pe data,
+ False otherwise
+ """
+ fastq_line_count_se = 4
+ fastq_line_count_pe = 8
+ paired_reads = False
+ output_file_names = []
+
+ file_prefix = output_dir + "/" + file_prefix
+ output_file = file_prefix + "_1.fq"
+ output_file_names.append(output_file)
+ output_file_writer = open(output_file, 'w')
+
+ count = 0
+ for line in file_content.strip().split("\n"):
+ # In the first line, check if it's paired or not
+ if count == 0 and len(line.strip().split("\t")) == fastq_line_count_pe:
+ paired_reads = True
+ output_file_pair = file_prefix + "_2.fq"
+ output_file_names.append(output_file_pair)
+ output_pair_writer = open(output_file_pair, 'w')
+
+ if paired_reads:
+ parts = line.strip().split("\t")
+
+ if len(parts) != fastq_line_count_pe:
+ continue
+
+ read_one = parts[:fastq_line_count_se]
+ read_two = parts[fastq_line_count_se:]
+ output_file_writer.write("\n".join(read_one) + "\n")
+ output_pair_writer.write("\n".join(read_two) + "\n")
+ else:
+ output_file_writer.writelines(line.strip().replace("\t", "\n") + "\n")
+
+ count += 1
+
+ output_file_writer.close()
+ if paired_reads:
+ output_pair_writer.close()
+
+ return output_file_names, paired_reads
+
+#################################
+# Aligner
+#################################
+
+
+def align_reads_star(sample_name, file_names, alignment_output_dir):
+ # If paired read flag is required
+ # paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ aligner_args = "{app_folder}/STAR/STAR --runThreadN 4 {aligner_extra_args} --genomeDir {index_folder} " \
+ "--readFilesIn {fastq_file_names} --outFileNamePrefix {output_folder} --outSAMtype BAM Unsorted".\
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/star_index",
+ fastq_file_names=" ".join(file_names),
+ output_folder=alignment_output_dir + "/")
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError("STAR failed to complete (Non-zero return code)!\n"
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+
+ if aligner_error.decode("utf8").strip() != "" or not os.path.isfile(alignment_output_dir + "/Log.final.out"):
+ raise ValueError("STAR failed to complete (No output file is found)!\n"
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+
+ print('Completed reads alignment')
+
+ bam_file_name_output = "Aligned.out.bam"
+
+ return bam_file_name_output
+
+
+def align_reads_hisat(sample_name, file_names, alignment_output_dir):
+ # If paired read flag is required
+ paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ if paired_read:
+ fastq_file_args = "-1 {} -2 {}".format(*file_names)
+ else:
+ fastq_file_args = "-U {}".format(*file_names)
+
+ aligner_args = "{app_folder}/hisat/hisat2 -p 4 --tmo {aligner_extra_args} -x {index_folder}/hisat2.index " \
+ "{fastq_file_names} -S {output_folder}/output.sam".\
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/hisat_index",
+ fastq_file_names=fastq_file_args,
+ output_folder=alignment_output_dir)
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError("HISAT2 failed to complete (Non-zero return code)!\n"
+ "HISAT2 stdout: {std_out} \nHISAT2 stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+ print('Completed reads alignment')
+
+    samtools_args = "{app_folder}/samtools/samtools view -@ 4 -b -o {output_folder}/output.bam {output_folder}/output.sam". \
+ format(app_folder=APPLICATION_FOLDER,
+ output_folder=alignment_output_dir)
+ print("Command: " + samtools_args)
+ samtools_process = Popen(shlex.split(samtools_args), stdout=PIPE, stderr=PIPE)
+ samtools_out, samtools_error = samtools_process.communicate()
+
+ if samtools_process.returncode != 0:
+ raise ValueError("Samtools failed to complete (Non-zero return code)!\n"
+ "Samtools stdout: {std_out} \nSamtools stderr: {std_err}".format(
+ std_out=samtools_out.decode("utf8"), std_err=samtools_error.decode("utf8")))
+
+ sam_file_name_output = "output.bam"
+
+ return sam_file_name_output
+
+
+def align_reads_subread(sample_name, file_names, alignment_output_dir):
+ # If paired read flag is required
+ paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ print("Aligning with subread")
+ if paired_read:
+ fastq_file_args = "-r {} -R {}".format(*file_names)
+ else:
+ fastq_file_args = "-r {}".format(*file_names)
+
+ aligner_args = "{app_folder}/subread/subread-align -T 4 -t 0 --SAMoutput {aligner_extra_args} " \
+ "-i {index_folder}/genome {fastq_file_names} -o {output_folder}/output.bam".\
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/subread_index",
+ fastq_file_names=fastq_file_args,
+ output_folder=alignment_output_dir)
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError("Subread failed to complete (Non-zero return code)!\n"
+ "Subread stdout: {std_out} \nSubread stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+ print('Completed reads alignment')
+
+ sam_file_name_output = "output.bam"
+
+ return sam_file_name_output
+
+
+#################################
+# Main functions
+#################################
+
+
+def alignment_step(keyval):
+ # Input: file_name, file_content as key,val
+ # Output: [sample_name, file_name] as [key,val]
+ global parser_result
+
+ prefix_regex = r"(.*_part[0-9]*)\."
+
+ file_name, file_content = keyval
+ prefix_match = re.findall(prefix_regex, file_name.rstrip("/").split("/")[-1])
+
+ if len(prefix_match) != 1:
+ raise ValueError("Filename can not be resolved (invalid, pattern mismatch): {}".format(file_name))
+
+ prefix = prefix_match[0]
+ sample_name = prefix.rsplit("_part", 1)[0]
+
+ alignment_dir = TEMP_OUTPUT_FOLDER + "/alignment_" + prefix
+
+ try:
+ os.mkdir(alignment_dir)
+    except FileExistsError:
+        print('Alignment directory {} exists.'.format(alignment_dir))
+
+ print("Recreating FASTQ file(s)")
+ split_file_names, paired_reads = split_interleaved_file(prefix, file_content, alignment_dir)
+ print("Recreating FASTQ file(s) complete. Files recreated: {}".format(",".join(split_file_names)))
+
+ alignment_output_dir = alignment_dir + "/aligner_output"
+
+ try:
+ os.mkdir(alignment_output_dir)
+    except FileExistsError:
+        print('Alignment output directory {} exists.'.format(alignment_output_dir))
+
+ if parser_result.aligner.lower() == "star":
+ aligned_sam_output = align_reads_star(sample_name, split_file_names, alignment_output_dir)
+ elif parser_result.aligner.lower() == "hisat" or parser_result.aligner.lower() == "hisat2":
+ aligned_sam_output = align_reads_hisat(sample_name, split_file_names, alignment_output_dir)
+ elif parser_result.aligner.lower() == "subread":
+ aligned_sam_output = align_reads_subread(sample_name, split_file_names, alignment_output_dir)
+ else:
+ print("Aligner specified is not yet supported. Defaulting to STAR")
+ aligned_sam_output = align_reads_star(sample_name, split_file_names, alignment_output_dir)
+
+ aligned_output_filepath = "{}/{}".format(alignment_output_dir.rstrip("/"), aligned_sam_output)
+ aligned_output_hdfs_filepath = "{}/{}".format(HDFS_TEMP_OUTPUT_FOLDER, prefix)
+
+ subprocess.call(["hdfs", "dfs", "-rm", aligned_output_hdfs_filepath])
+ subprocess.call(["hdfs", "dfs", "-put", aligned_output_filepath, aligned_output_hdfs_filepath])
+
+ shutil.rmtree(alignment_dir, ignore_errors=True)
+ return sample_name, [prefix]
+
+
+def fuse_alignment(keyval):
+ # Input: sample_name, [file_name,...] as key, val
+ # Output: sample_name
+ global parser_result
+
+ key, file_lists = keyval
+ fuse_alignment_dir = TEMP_OUTPUT_FOLDER.rstrip("/") + "/" + key
+
+ ordered_file_lists = sorted([(f, int(f.rsplit("part", 1)[-1])) for f in file_lists], key=lambda x:x[-1])
+ print(ordered_file_lists)
+
+ try:
+ os.mkdir(fuse_alignment_dir)
+    except FileExistsError:
+        print('Fuse alignment directory {} exists.'.format(fuse_alignment_dir))
+
+ fuse_alignment_file = key + ".bam"
+
+ previous_file_path = ""
+ for index, file_name_pair in enumerate(ordered_file_lists):
+ file_name, number = file_name_pair
+ local_file_path = fuse_alignment_dir + "/" + file_name + ".bam"
+ subprocess.call(["hdfs", "dfs", "-get", HDFS_TEMP_OUTPUT_FOLDER.rstrip("/") + "/" + file_name, local_file_path])
+
+ if index != 0:
+ new_merged_file_path = "{}/temp_{}.bam".format(fuse_alignment_dir, index)
+ subprocess.call(["samtools", "cat", "-o", new_merged_file_path, previous_file_path, local_file_path])
+
+ os.remove(previous_file_path)
+ os.remove(local_file_path)
+ previous_file_path = new_merged_file_path
+ else:
+ previous_file_path = local_file_path
+
+ subprocess.call(["hdfs", "dfs", "-rm", HDFS_TEMP_OUTPUT_FOLDER.rstrip("/") + "/" + file_name])
+
+ if parser_result.output_dir.startswith("s3://"): # From S3
+ s3_client = boto3.client('s3', region_name=parser_result.aws_region)
+ print("uploading to S3")
+ output_bucket, key_prefix = parser_result.output_dir.strip().strip("/")[5:].split("/", 1)
+ s3_client.upload_file(previous_file_path, output_bucket, key_prefix + "/" + fuse_alignment_file)
+ else:
+ print("outputting to HDFS")
+ subprocess.call(["hdfs", "dfs", "-mkdir", "-p", parser_result.output_dir.rstrip("/")])
+ subprocess.call(["hdfs", "dfs", "-put", previous_file_path, parser_result.output_dir.rstrip("/") + "/" +
+ fuse_alignment_file])
+
+ os.remove(previous_file_path)
+ return key
+
+if __name__ == "__main__":
+ global parser_result
+
+ parser = argparse.ArgumentParser(description='Spark-based RNA-seq Pipeline Alignment')
+ parser.add_argument('--input', '-i', action="store", dest="input_dir", help="Input directory - HDFS or S3")
+ parser.add_argument('--output', '-o', action="store", dest="output_dir", help="Output directory - HDFS or S3")
+ parser.add_argument('--aligner_tools', '-at', action="store", dest="aligner", nargs='?',
+ help="Aligner to be used (STAR|HISAT2|Subread)", default="STAR")
+ parser.add_argument('--aligner_extra_args', '-s', action="store", dest="aligner_extra_args", nargs='?',
+                        help="Extra arguments to be passed to the alignment tool", default="")
+ parser.add_argument('--region', '-r', action="store", dest="aws_region", help="AWS region")
+
+ parser_result = parser.parse_args()
+
+ split_num = 0
+
+ conf = SparkConf().setAppName("Spark-based RNA-seq Pipeline Alignment")
+ sc = SparkContext(conf=conf)
+
+ if parser_result.input_dir.startswith("s3://"): # From S3
+ s3_client = boto3.client('s3', region_name=parser_result.aws_region)
+ # Get number of input files
+ s3_paginator = s3_client.get_paginator('list_objects')
+ input_bucket, key_prefix = parser_result.input_dir[5:].strip().split("/", 1)
+
+ input_file_num = 0
+
+ for result in s3_paginator.paginate(Bucket=input_bucket, Prefix=key_prefix):
+ for file in result.get("Contents"):
+ input_file_num += 1
+
+ if input_file_num == 0:
+ raise ValueError("Input directory is invalid or empty!")
+
+ split_num = input_file_num
+ else: # From HDFS
+ hdfs_process = Popen(shlex.split("hdfs dfs -count {}".format(parser_result.input_dir)),
+ stdout=PIPE, stderr=PIPE)
+ hdfs_out, hdfs_error = hdfs_process.communicate()
+
+ if hdfs_error:
+ raise ValueError("Input directory is invalid or empty!")
+
+ dir_count, file_count, size, path = hdfs_out.strip().split()
+
+ split_num = int(file_count)
+
+ subprocess.call(["hdfs", "dfs", "-mkdir", "-p", HDFS_TEMP_OUTPUT_FOLDER])
+
+ input_files = sc.wholeTextFiles(parser_result.input_dir, split_num)
+
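+    # Map: align each interleaved FASTQ chunk and stage its BAM in HDFS, keyed by sample name.
+    # Reduce: collect the chunk prefixes per sample, then fuse_alignment concatenates the BAMs
+    # in part order (samtools cat) and writes one BAM per sample to S3 or HDFS.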
+ aligned_files = input_files.map(alignment_step)
+ aligned_file_lists = aligned_files.reduceByKey(add)
+ aligned_samples = aligned_file_lists.map(fuse_alignment)
+ aligned_samples.collect()
diff --git a/source/spark_runner/run_pipeline_assembly.py b/source/spark_runner/run_pipeline_assembly.py
new file mode 100644
index 0000000..f713e6d
--- /dev/null
+++ b/source/spark_runner/run_pipeline_assembly.py
@@ -0,0 +1,693 @@
+import argparse
+import datetime
+import os
+import sys
+import shlex
+import shutil
+from subprocess import Popen, PIPE
+from operator import add
+from pyspark import SparkContext, SparkConf
+import pyspark.serializers
+import subprocess
+import boto3
+import re
+import pysam
+
+global parser_result
+
+if sys.version_info >= (3, 4):
+ pyspark.serializers.protocol = 4
+
+APPLICATION_FOLDER = "/app"
+GENOME_REFERENCES_FOLDER = "/mnt/ref"
+TEMP_OUTPUT_FOLDER = "/mnt/output"
+
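+# Aligned reads are grouped into per-chromosome bins of BIN_BP_SIZE bp and each bin is
+# assembled independently; with tiling enabled, consecutive bins overlap by BIN_BP_OFFSET bp
+# so reads spanning a bin boundary contribute to both bins.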
+BIN_BP_SIZE = 320000
+BIN_BP_OFFSET = 64000 # Note: offset must never be greater than (BIN_BP_SIZE/2)+1!
+BIN_BP_SIZE_W_OFFSET = BIN_BP_SIZE-BIN_BP_OFFSET
+MINIMUM_READS_IN_BIN = 50
+CHROMOSOME_PATTERN = r"^chr[0-9XY]+"
+
+
+#################################
+# File splitting
+#################################
+
+
+def split_interleaved_file(file_prefix, file_content, output_dir):
+ """
+ Unpacks an interleaved file into the standard FASTQ format
+ :param file_prefix: the prefix of the file name
+ :param file_content: the lines of content from the input file
+ :param output_dir: the location to store the unpacked files
+ :return: a tuple with first element being a list of output file names
+ (1 for se, 2 for pe); 2nd element a boolean flag - True if pe data,
+ False otherwise
+ """
+ fastq_line_count_se = 4
+ fastq_line_count_pe = 8
+ paired_reads = False
+ output_file_names = []
+
+ file_prefix = output_dir + "/" + file_prefix
+ output_file = file_prefix + "_1.fq"
+ output_file_names.append(output_file)
+ output_file_writer = open(output_file, 'w')
+
+ count = 0
+ for line in file_content.strip().split("\n"):
+ # In the first line, check if it's paired or not
+ if count == 0 and len(line.strip().split("\t")) == fastq_line_count_pe:
+ paired_reads = True
+ output_file_pair = file_prefix + "_2.fq"
+ output_file_names.append(output_file_pair)
+ output_pair_writer = open(output_file_pair, 'w')
+
+ if paired_reads:
+ parts = line.strip().split("\t")
+
+ if len(parts) != fastq_line_count_pe:
+ continue
+
+ read_one = parts[:fastq_line_count_se]
+ read_two = parts[fastq_line_count_se:]
+ output_file_writer.write("\n".join(read_one) + "\n")
+ output_pair_writer.write("\n".join(read_two) + "\n")
+ else:
+ output_file_writer.writelines(line.strip().replace("\t", "\n") + "\n")
+
+ count += 1
+
+ output_file_writer.close()
+ if paired_reads:
+ output_pair_writer.close()
+
+ return output_file_names, paired_reads
+
+
+#################################
+# Aligner
+#################################
+
+
+def align_reads_star(file_names, alignment_output_dir):
+ # If paired read flag is required
+ # paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ aligner_args = "{app_folder}/STAR/STAR --runThreadN 4 --outSAMstrandField intronMotif {aligner_extra_args} " \
+ "--genomeDir {index_folder} --readFilesIn {fastq_file_names} --outFileNamePrefix {output_folder}". \
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/star_index",
+ fastq_file_names=" ".join(file_names),
+ output_folder=alignment_output_dir + "/")
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError(
+ "STAR failed to complete (Non-zero return code)!\n"
+ "STAR stdout: {std_out} \n"
+ "STAR stderr: {std_err}".
+ format(std_out=aligner_out.decode("utf8"), std_err=aligner_error.decode("utf8"))
+ )
+
+ if aligner_error.decode("utf8").strip() != "" or not os.path.isfile(alignment_output_dir + "/Log.final.out"):
+ raise ValueError("STAR failed to complete (No output file is found)!\n"
+ "STAR stdout: {std_out} \n"
+ "STAR stderr: {std_err}".
+ format(std_out=aligner_out.decode("utf8"), std_err=aligner_error.decode("utf8")))
+
+ print('Completed reads alignment')
+
+ sam_file_name_output = "Aligned.out.sam"
+ return sam_file_name_output
+
+
+def align_reads_hisat(file_names, alignment_output_dir):
+ # If paired read flag is required
+ paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ if paired_read:
+ fastq_file_args = "-1 {} -2 {}".format(*file_names)
+ else:
+ fastq_file_args = "-U {}".format(*file_names)
+
+ aligner_args = "{app_folder}/hisat/hisat2 -p 4 --no-unal --no-mixed {aligner_extra_args} " \
+ "-x {index_folder}/hisat2.index {fastq_file_names} -S {output_folder}/output.sam". \
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/hisat_index",
+ fastq_file_names=fastq_file_args,
+ output_folder=alignment_output_dir)
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError(
+ "HISAT2 failed to complete (Non-zero return code)!\n"
+ "HISAT2 stdout: {std_out} \n"
+ "HISAT2 stderr: {std_err}".
+ format(std_out=aligner_out.decode("utf8"), std_err=aligner_error.decode("utf8"))
+ )
+ print('Completed reads alignment')
+
+ sam_file_name_output = "output.sam"
+ return sam_file_name_output
+
+
+def align_reads_subread(file_names, alignment_output_dir):
+ # If paired read flag is required
+ paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ if paired_read:
+ fastq_file_args = "-r {} -R {}".format(*file_names)
+ else:
+ fastq_file_args = "-r {}".format(*file_names)
+
+ aligner_args = "{app_folder}/subread/subread-align -T 4 -t 0 --SAMoutput -u {aligner_extra_args} " \
+ "-i {index_folder}/genome {fastq_file_names} -o {output_folder}/output.sam". \
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/subread_index",
+ fastq_file_names=fastq_file_args,
+ output_folder=alignment_output_dir)
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError(
+ "Subread failed to complete (Non-zero return code)!\n"
+ "Subread stdout: {std_out} \n"
+ "Subread stderr: {std_err}".format(
+ std_out=aligner_out.decode("utf8"), std_err=aligner_error.decode("utf8"))
+ )
+ print('Completed reads alignment')
+
+ sam_file_name_output = "output.sam"
+ return sam_file_name_output
+
+
+#################################
+# Binning
+#################################
+
+
+def bin_reads(aligned_output_filepath, paired_reads):
+ binned_reads = []
+
+ def process_read(read_one, read_two=None):
+ if read_one.reference_id in chromosome_list_broadcast.value:
+ read_start = read_one.reference_start
+ read_end = read_one.reference_end
+ read_one.query_sequence = ""
+ read_one.query_qualities = ""
+
+ if read_two:
+ if read_two.reference_start < read_start:
+ read_start = read_two.reference_start
+
+ if read_two.reference_end > read_end:
+ read_end = read_two.reference_end
+
+ read_two.query_sequence = ""
+ read_two.query_qualities = ""
+
+ if parser_result.enable_tiling:
+ start_bin_number = max(read_start-BIN_BP_OFFSET, 0) // BIN_BP_SIZE_W_OFFSET
+ end_bin_number = read_end // BIN_BP_SIZE_W_OFFSET
+ else:
+ start_bin_number = read_start // BIN_BP_SIZE
+ end_bin_number = read_end // BIN_BP_SIZE
+
+ for bin_number in range(start_bin_number, end_bin_number+1):
+ binned_reads.append(((read_one.reference_id, bin_number), [read_one.tostring() + "\n"]))
+ if read_two:
+ binned_reads.append(((read_two.reference_id, bin_number), [read_two.tostring() + "\n"]))
+
+ with pysam.AlignmentFile(aligned_output_filepath) as alignment_file:
+ if paired_reads:
+ previous_read = None
+ for index, read in enumerate(alignment_file):
+ if index % 2 == 1:
+ # For improperly paired reads (across multiple chromosome), we will treat them like single reads
+ if previous_read.reference_id != read.reference_id:
+ process_read(previous_read)
+ process_read(read)
+ else:
+ process_read(previous_read, read)
+ else:
+ previous_read = read
+ else:
+ for read in alignment_file:
+ process_read(read)
+
+ return binned_reads
+
+
+#################################
+# Assembly
+#################################
+
+
+def assemble_transcripts_stringtie(bin_id, aligned_output_filepath, assembler_output_dir, reference_gtf_filepath=None):
+ print("Assembling reads...")
+ assembler_command = "{app_folder}/stringtie/stringtie -p 8 -l {prefix} {assembler_extra_args} {reference_gtf} " \
+ "{aligned_file}".\
+ format(app_folder=APPLICATION_FOLDER,
+ prefix=bin_id,
+ assembler_extra_args=parser_result.assembler_extra_args,
+ reference_gtf="-G " + reference_gtf_filepath if reference_gtf_filepath else "",
+ aligned_file=aligned_output_filepath)
+ print("Command: " + assembler_command)
+ assembler_process = Popen(shlex.split(assembler_command), stdout=PIPE, stderr=PIPE)
+ assembler_out, assembler_error = assembler_process.communicate()
+
+ if assembler_process.returncode != 0:
+ raise ValueError('Stringtie failed to complete (non-zero return):\n'
+ 'Stringtie stdout: {std_out}\n'
+ 'Stringtie stderr: {std_err}'.
+ format(std_out=assembler_out.decode("utf8"), std_err=assembler_error.decode("utf8")))
+
+ if assembler_error.decode("utf8").strip() != "":
+ raise ValueError('Stringtie failed to complete (error):\n'
+ 'Stringtie stdout: {std_out}\n'
+ 'Stringtie stderr: {std_err}'.
+ format(std_out=assembler_out.decode("utf8"), std_err=assembler_error.decode("utf8")))
+
+ return assembler_out.decode("utf8")
+
+
+def assemble_transcripts_scallop(bin_id, aligned_output_filepath, assembler_output_dir):
+ print("Assembling reads...")
+ output_gtf = assembler_output_dir + "/output.gtf"
+ assembler_command = "{app_folder}/scallop/scallop -i {sam_file} -o {out_gtf} {assembler_extra_args} --verbose 0".\
+ format(app_folder=APPLICATION_FOLDER,
+ sam_file=aligned_output_filepath,
+ out_gtf=output_gtf,
+ assembler_extra_args=parser_result.assembler_extra_args.strip())
+ print("Command: " + assembler_command)
+ assembler_process = Popen(shlex.split(assembler_command), stdout=PIPE, stderr=PIPE)
+ assembler_out, assembler_error = assembler_process.communicate()
+
+ if assembler_process.returncode != 0:
+ raise ValueError('scallop failed to complete (non-zero return):\n'
+ 'Scallop stdout: {std_out}\n'
+ 'Scallop stderr: {std_err}'.
+ format(std_out=assembler_out.decode("utf8"), std_err=assembler_error.decode("utf8")))
+
+ if not os.path.isfile(output_gtf):
+ raise ValueError('Scallop failed to complete (no output file):\n'
+ 'Scallop stdout: {std_out}\n'
+ 'Scallop stderr: {std_err}'.
+ format(std_out=assembler_out.decode("utf8"), std_err=assembler_error.decode("utf8")))
+
+ # We need to give each GTF entry a unique ID for the final merging using the region
+ annotation_output = ""
+ with open(output_gtf) as gtf:
+ for line in gtf:
+ if not line.startswith("#"):
+ line = line.replace('"gene', '"{}'.format(bin_id))
+ annotation_output += line
+
+ return annotation_output
+
+
+#################################
+# Augmenting Annotation
+#################################
+
+
+def merge_reference_annotation(assembled_transcript):
+ assembled_transcript_path = "all_transcripts.gtf"
+ merged_transcript_path = "reference.gtf"
+
+ # Create a GTF file to pass to StringTie.
+ with open(assembled_transcript_path, 'w') as gtf_file:
+ gtf_file.writelines(assembled_transcript)
+ assembled_transcript.clear()
+
+ # Merge the reference annotation and transcripts from all bins.
+ merge_command = "{app_folder}/stringtie/stringtie -p 32 {assembler_extra_args} --merge {assembled_transcript} " \
+ "-G {genome_ref_folder}/{annotation_file} -o {merged_transcript}".\
+ format(app_folder=APPLICATION_FOLDER,
+ assembler_extra_args=parser_result.assembler_merge_extra_args,
+ assembled_transcript=assembled_transcript_path,
+ genome_ref_folder=GENOME_REFERENCES_FOLDER + "/genome_ref",
+ annotation_file=parser_result.annotation_file,
+ merged_transcript=merged_transcript_path)
+ print("Command: " + merge_command)
+ merge_process = Popen(shlex.split(merge_command), stdout=PIPE, stderr=PIPE)
+ merge_out, merge_error = merge_process.communicate()
+
+ if merge_process.returncode != 0:
+ raise ValueError('Stringtie (merge) failed to complete (non-zero return):\n'
+ 'Stringtie stdout: {std_out}\n'
+ 'Stringtie stderr: {std_err}\n'.
+ format(std_out=merge_out.decode("utf8"), std_err=merge_error.decode("utf8")))
+
+ if merge_error.decode("utf8").strip() != "" or not os.path.isfile(merged_transcript_path):
+ raise ValueError('Stringtie (merge) failed to complete (error):\n'
+ 'Stringtie stdout: {std_out}\n'
+ 'Stringtie stderr: {std_err}\n'.
+ format(std_out=merge_out.decode("utf8"), std_err=merge_error.decode("utf8")))
+
+ return assembled_transcript_path, merged_transcript_path
+
+
+def extract_reference_gtf(chromosome_name, assembler_output_dir):
+ gtf_reference_extracted_path = assembler_output_dir + "/" + chromosome_name + "_annotation.gtf"
+
+ gtf_extract_command = """awk '$1 == "{chr_name}"' {genome_ref_folder}/{annotation_file}""".\
+ format(chr_name=chromosome_name,
+ genome_ref_folder=GENOME_REFERENCES_FOLDER + "/genome_ref",
+ annotation_file=parser_result.annotation_file)
+ print("Command: " + gtf_extract_command)
+
+    with open(gtf_reference_extracted_path, "w") as output_gtf:
+        gtf_extract_process = Popen(shlex.split(gtf_extract_command), stdout=output_gtf, stderr=PIPE)
+ gtf_extract_out, gtf_extract_error = gtf_extract_process.communicate()
+
+ if gtf_extract_process.returncode != 0 or gtf_extract_error.decode("utf8").strip() != "":
+ print("Extraction failed! Using full annotation file instead.\n"
+ "Stdout: {std_out}\n"
+ "Stderr: {std_err}".format(std_out=gtf_extract_out.decode("utf8"),
+ std_err=gtf_extract_error.decode("utf8")))
+
+ gtf_reference_extracted_path = "{genome_ref_folder}/{annotation_file}". \
+ format(genome_ref_folder=GENOME_REFERENCES_FOLDER + "/genome_ref",
+ annotation_file=parser_result.annotation_file)
+
+ # No reference data exist for the given chromosome. Will use de-novo argument instead.
+ if os.stat(gtf_reference_extracted_path).st_size == 0:
+ gtf_reference_extracted_path = None
+
+ return gtf_reference_extracted_path
+
+
+def analyse_transcripts(merged_transcript_path, analysis_output_dir):
+ """
+ Analyses the accuracy and precision of a generated GTF, with
+ respect to a reference genome. Uses GFFCompare.
+ :param merged_transcript_path: updated GTF
+ :param analysis_output_dir: output directory
+ :return: file path
+ """
+ # Merge the reference annotation and transcripts from all bins.
+ gffcompare_command = "{app_folder}/gffcompare/gffcompare -T -r {genome_ref_folder}/{annotation_file} " \
+ "-o {analysis_output_dir}/gffcomp {merged_transcript}".\
+ format(app_folder=APPLICATION_FOLDER,
+ genome_ref_folder=GENOME_REFERENCES_FOLDER + "/genome_ref",
+ annotation_file=parser_result.annotation_file,
+ analysis_output_dir=analysis_output_dir,
+ merged_transcript=merged_transcript_path)
+ print("Command: " + gffcompare_command)
+ gffcompare_process = Popen(shlex.split(gffcompare_command), stdout=PIPE, stderr=PIPE)
+ gffcompare_out, gffcompare_error = gffcompare_process.communicate()
+
+ if gffcompare_process.returncode != 0:
+ raise ValueError('GFFcompare failed to complete (non-zero return):\n'
+ 'GFFcompare stdout: {std_out}\n'
+ 'GFFcompare stderr: {std_err}\n'.
+ format(std_out=gffcompare_out.decode("utf8"),std_err=gffcompare_error.decode("utf8")))
+
+ gffcompare_stats_output = "gffcomp.stats"
+ if gffcompare_error.decode("utf8").strip() != "" or not os.path.isfile(analysis_output_dir + "/" +
+ gffcompare_stats_output):
+ raise ValueError('GFFcompare failed to complete (error):\n'
+ 'GFFcompare stdout: {std_out}\n'
+ 'GFFcompare stderr: {std_err}\n'.
+ format(std_out=gffcompare_out.decode("utf8"), std_err=gffcompare_error.decode("utf8")))
+
+ return gffcompare_stats_output
+
+#################################
+# Main functions
+#################################
+
+
+def create_sam_header():
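+    # Align an empty FASTQ so the aligner emits just a SAM header; the header is later
+    # prepended to each bin's reads, and reference names matching CHROMOSOME_PATTERN are
+    # kept as the chromosome whitelist used for binning.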
+ alignment_dir = TEMP_OUTPUT_FOLDER + "/sam_header"
+
+ try:
+ os.mkdir(alignment_dir)
+    except FileExistsError:
+        print('Alignment directory {} exists.'.format(alignment_dir))
+
+ file_name = os.path.join(alignment_dir, "empty.sam")
+ if os.path.isfile(file_name):
+ os.remove(file_name)
+
+ os.mknod(file_name)
+
+ if parser_result.aligner.lower() == "star":
+ aligned_sam_output = align_reads_star([file_name], alignment_dir)
+ elif parser_result.aligner.lower() == "hisat" or parser_result.aligner.lower() == "hisat2":
+ aligned_sam_output = align_reads_hisat([file_name], alignment_dir)
+ elif parser_result.aligner.lower() == "subread":
+ aligned_sam_output = align_reads_subread([file_name], alignment_dir)
+ else:
+ print("Aligner specified is not yet supported. Defaulting to STAR")
+ aligned_sam_output = align_reads_star([file_name], alignment_dir)
+
+ aligned_output_filepath = os.path.join(alignment_dir, aligned_sam_output)
+ with pysam.AlignmentFile(aligned_output_filepath) as f:
+ sam_header = str(f.header)
+
+ chromosome_list = {}
+ for index, reference_name in enumerate(f.header.references):
+ if re.search(CHROMOSOME_PATTERN, reference_name) is not None:
+ chromosome_list[index] = reference_name
+
+ shutil.rmtree(alignment_dir, ignore_errors=True)
+ return sam_header, chromosome_list
+
+
+def alignment_bin_step(keyval):
+ # Input: file_name, file_content as key,val
+ # Output: [sample_name\tgene, count] as [key,val]
+ global parser_result
+
+ prefix_regex = r"(.*_part[0-9]*)\."
+
+ file_name, file_content = keyval
+ prefix_match = re.findall(prefix_regex, file_name.rstrip("/").split("/")[-1])
+
+ if len(prefix_match) != 1:
+ raise ValueError("Filename can not be resolved (invalid, pattern mismatch): {}".format(file_name))
+
+ prefix = prefix_match[0]
+
+ alignment_dir = TEMP_OUTPUT_FOLDER + "/alignment_" + prefix
+ try:
+ os.mkdir(alignment_dir)
+    except FileExistsError:
+        print('Alignment directory {} exists.'.format(alignment_dir))
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Recreating FASTQ file(s)")
+ split_file_names, paired_reads = split_interleaved_file(prefix, file_content, alignment_dir)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" +
+ "Recreating FASTQ file(s) complete. Files recreated: {}".format(",".join(split_file_names)))
+
+ alignment_output_dir = alignment_dir + "/aligner_output"
+
+ try:
+ os.mkdir(alignment_output_dir)
+    except FileExistsError:
+        print('Alignment output directory {} exists.'.format(alignment_output_dir))
+
+ if parser_result.aligner.lower() == "star":
+ aligned_sam_output = align_reads_star(split_file_names, alignment_output_dir)
+ elif parser_result.aligner.lower() == "hisat" or parser_result.aligner.lower() == "hisat2":
+ aligned_sam_output = align_reads_hisat(split_file_names, alignment_output_dir)
+ elif parser_result.aligner.lower() == "subread":
+ aligned_sam_output = align_reads_subread(split_file_names, alignment_output_dir)
+ else:
+ print("Aligner specified is not yet supported. Defaulting to STAR")
+ aligned_sam_output = align_reads_star(split_file_names, alignment_output_dir)
+
+ aligned_output_filepath = os.path.join(alignment_output_dir, aligned_sam_output)
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Binning reads")
+ binned_reads = bin_reads(aligned_output_filepath, paired_reads)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Binning reads done")
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Alignment of reads for {} done.".format(prefix))
+ shutil.rmtree(alignment_dir, ignore_errors=True)
+ return binned_reads
+
+
+def assemble_transcripts(keyval):
+ """
+ Applies Stringtie to formulate transcripts.
+ :param region_reads:
+ :return: (chromosome_region, gtf_output) tuple
+ """
+ bin_id, read_list = keyval
+ chromosome_id, bin_number = bin_id
+ chromosome_name = chromosome_list_broadcast.value[chromosome_id]
+
+ output = []
+
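+ # Skip bins with fewer than MINIMUM_READS_IN_BIN reads; they produce no assembled transcripts.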
+ if len(read_list) >= MINIMUM_READS_IN_BIN:
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Starting transcript assembly for {}.".format(bin_id))
+ sample_prefix = "{}_{}".format(chromosome_name, bin_number)
+ assembler_output_dir = TEMP_OUTPUT_FOLDER + '/assembly_' + sample_prefix
+ try:
+ os.makedirs(assembler_output_dir)
+ except FileExistsError:
+ print("Assembler output directory already exists for %s." % sample_prefix)
+
+ aligned_output_file = assembler_output_dir + '/output.sam'
+ aligned_sorted_output_file = assembler_output_dir + '/output.sorted.bam'
+
+ with open(aligned_output_file, 'w') as sam_outfile:
+ sam_outfile.write(sam_header_broadcast.value.strip() + "\n")
+ sam_outfile.writelines(read_list)
+
+ read_list.clear()
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(
+ datetime.datetime.now()) + ":" + "Sorting and converting SAM file for {}.".format(bin_id))
+ pysam.sort("-@", '4', "-m", "1G", "-o", aligned_sorted_output_file, aligned_output_file)
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(
+ datetime.datetime.now()) + ":" + "Running {} for {}.".format(parser_result.assembler, bin_id))
+
+ if parser_result.assembler.lower() == "stringtie":
+ # If we are performing genome-guided assembly, build a smaller GTF file.
+ reference_gtf_filepath = None
+ if parser_result.assembler_use_reference:
+ reference_gtf_filepath = extract_reference_gtf(chromosome_name, assembler_output_dir)
+
+ gtf_output = assemble_transcripts_stringtie(sample_prefix, aligned_sorted_output_file, assembler_output_dir,
+ reference_gtf_filepath)
+ elif parser_result.assembler.lower() == "scallop":
+ gtf_output = assemble_transcripts_scallop(sample_prefix, aligned_sorted_output_file, assembler_output_dir)
+ else:
+ print("Assembler specified is not yet supported. Defaulting to StringTie")
+ gtf_output = assemble_transcripts_stringtie(sample_prefix, aligned_sorted_output_file, assembler_output_dir,
+ None)
+
+ shutil.rmtree(assembler_output_dir, ignore_errors=True)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Assembly of transcript for {} done.".format(bin_id))
+ output.append((bin_id, gtf_output.strip()+"\n"))
+
+ return output
+
+
+if __name__ == "__main__":
+ global parser_result
+
+ parser = argparse.ArgumentParser(description='Spark-based RNA-seq Pipeline')
+ parser.add_argument('--input', '-i', action="store", dest="input_dir", help="Input directory - HDFS or S3")
+ parser.add_argument('--output', '-o', action="store", dest="output_dir", help="Output directory - HDFS or S3")
+ parser.add_argument('--annotation', '-a', action="store", dest="annotation_file",
+ help="Name of annotation file to be used")
+ parser.add_argument('--enable-tiling', '-et', action="store_true", dest="enable_tiling",
+ help="Enable tiling of genome bins")
+ parser.add_argument('--enable-analysis', '-ea', action="store_true", dest="enable_analysis",
+ help="Generate stats on updated GTF w.r.t. reference GTF")
+ parser.add_argument('--aligner-tools', '-at', action="store", dest="aligner", nargs='?',
+ help="Aligner to be used (STAR|HISAT2|Subread)", default="STAR")
+ parser.add_argument('--aligner-extra-args', '-s', action="store", dest="aligner_extra_args", nargs='?',
+ help="Extra argument to be passed to alignment tool", default="")
+ parser.add_argument('--assembler-tools', '-as', action="store", dest="assembler", nargs='?',
+ help="Assembler tools to be used (StringTie|Scallop)", default="StringTie")
+ parser.add_argument('--assembler-extra-args', '-ag', action="store", dest="assembler_extra_args", nargs='?',
+ help="Extra arguments to be passed to the assembler tool", default="")
+ parser.add_argument('--assembler-use-reference', '-aur', action="store_true", dest="assembler_use_reference",
+ help="Use annotation in initial transcript assembly - only applicable to StringTie")
+ parser.add_argument('--assembler-merge-extra-args', '-am', action="store", dest="assembler_merge_extra_args",
+ nargs='?', help="Extra arguments to be passed to the assembler merge tool", default="")
+ parser.add_argument('--region', '-r', action="store", dest="aws_region", help="AWS region")
+
+ parser_result = parser.parse_args()
+
+ split_num = 0
+
+ conf = SparkConf().setAppName("Spark-based RNA-seq Pipeline for Read Assembly")
+ sc = SparkContext(conf=conf)
+
+ if parser_result.input_dir.startswith("s3://"): # From S3
+
+ s3_client = boto3.client('s3', region_name=parser_result.aws_region)
+ # Get number of input files
+ s3_paginator = s3_client.get_paginator('list_objects')
+ input_bucket, key_prefix = parser_result.input_dir[5:].strip().split("/", 1)
+
+ input_file_num = 0
+
+ for result in s3_paginator.paginate(Bucket=input_bucket, Prefix=key_prefix):
+ for file in result.get("Contents", []):
+ input_file_num += 1
+
+ if input_file_num == 0:
+ raise ValueError("Input directory is invalid or empty!")
+
+ split_num = input_file_num
+ else: # From HDFS
+ hdfs_process = Popen(shlex.split("hdfs dfs -count {}".format(parser_result.input_dir)),
+ stdout=PIPE, stderr=PIPE)
+ hdfs_out, hdfs_error = hdfs_process.communicate()
+
+ if hdfs_error:
+ raise ValueError("Input directory is invalid or empty!")
+
+ dir_count, file_count, size, path = hdfs_out.strip().split()
+
+ split_num = int(file_count)
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Create sam header.")
+ sam_header, chromosome_list = create_sam_header()
+ sam_header_broadcast = sc.broadcast(sam_header)
+ chromosome_list_broadcast = sc.broadcast(chromosome_list)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Sam header done.")
+
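+ # Pipeline: load each interleaved FASTQ part as a whole file, align and bin its reads per
+ # (chromosome, bin), concatenate each bin's reads across files, then assemble every bin independently.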
+ input_files = sc.wholeTextFiles(parser_result.input_dir, split_num)
+ read_binned = input_files.flatMap(alignment_bin_step).reduceByKey(add, numPartitions=split_num*2)
+
+ binned_transcripts = read_binned.flatMap(assemble_transcripts).persist()
+ binned_transcripts2 = binned_transcripts.sortByKey().values().collect()
+
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Starting to merge back to reference.")
+ assembled_transcript, merged_transcript = merge_reference_annotation(binned_transcripts2)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Merging back to reference completed.")
+
+ if parser_result.input_dir.startswith("s3://"): # From S3
+ output_bucket, key_prefix = parser_result.output_dir.strip().strip("/")[5:].split("/", 1)
+ s3_client.upload_file(assembled_transcript, output_bucket, key_prefix + "/" + assembled_transcript)
+ s3_client.upload_file(merged_transcript, output_bucket, key_prefix + "/" + merged_transcript)
+ else:
+ subprocess.call(["hdfs", "dfs", "-mkdir", "-p", parser_result.output_dir.rstrip("/")])
+ subprocess.call(["hdfs", "dfs", "-put", assembled_transcript, parser_result.output_dir.rstrip("/") + "/"
+ + assembled_transcript])
+ subprocess.call(["hdfs", "dfs", "-put", merged_transcript, parser_result.output_dir.rstrip("/") + "/"
+ + merged_transcript])
+
+ if parser_result.enable_analysis:
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Starting analysis of GTF.")
+ analysis_output_dir = TEMP_OUTPUT_FOLDER + "/analysis"
+ try:
+ os.mkdir(analysis_output_dir)
+ except FileExistsError:
+ print('Analysis directory {} already exists.'.format(analysis_output_dir))
+
+ analysis_result = analyse_transcripts(merged_transcript, analysis_output_dir)
+ print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ":" + "Analysis of GTF complete.")
+
+ analysis_result_path = os.path.join(analysis_output_dir, analysis_result)
+ # Rename the analysis output with a .txt extension to make it easier to identify and open
+ analysis_result_key = analysis_result.rsplit(".", 1)[0] + ".txt"
+ if parser_result.input_dir.startswith("s3://"): # From S3
+ s3_client.upload_file(analysis_result_path, output_bucket, key_prefix + "/" + analysis_result_key)
+ else:
+ subprocess.call(["hdfs", "dfs", "-put", analysis_result_path, parser_result.output_dir.rstrip("/") + "/"
+ + analysis_result_key])
+
+ shutil.rmtree(analysis_output_dir, ignore_errors=True)
+
+ os.remove(assembled_transcript)
+ os.remove(merged_transcript)
diff --git a/source/spark_runner/run_pipeline_multiple_files.py b/source/spark_runner/run_pipeline_multiple_files.py
index 27324e0..b5b68a2 100644
--- a/source/spark_runner/run_pipeline_multiple_files.py
+++ b/source/spark_runner/run_pipeline_multiple_files.py
@@ -2,8 +2,10 @@
import os
import shlex
import shutil
+import sys
from subprocess import Popen, PIPE
from pyspark import SparkContext, SparkConf
+import pyspark.serializers
import pandas as pd
import subprocess
import boto3
@@ -11,7 +13,10 @@
global parser_result
-APPLICATION_FOLDER = "/mnt/app"
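+# pickle protocol 4 (available from Python 3.4) removes the 4 GiB object size limit when Spark pickles data between tasks.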
+if sys.version_info >= (3, 4):
+ pyspark.serializers.protocol = 4
+
+APPLICATION_FOLDER = "/app"
GENOME_REFERENCES_FOLDER = "/mnt/ref"
TEMP_OUTPUT_FOLDER = "/mnt/output"
@@ -106,13 +111,13 @@ def align_reads_star(sample_name, file_names, alignment_output_dir):
if aligner_process.returncode != 0:
raise ValueError("STAR failed to complete (Non-zero return code)!\n"
- "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out,
- std_err=aligner_error))
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
- if aligner_error.strip() != "" or not os.path.isfile(alignment_output_dir + "/Log.final.out"):
+ if aligner_error.decode("utf8").strip() != "" or not os.path.isfile(alignment_output_dir + "/Log.final.out"):
raise ValueError("STAR failed to complete (No output file is found)!\n"
- "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out,
- std_err=aligner_error))
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
print('Completed reads alignment')
@@ -140,7 +145,7 @@ def align_reads_hisat(sample_name, file_names, alignment_output_dir):
if paired_read:
fastq_file_args = "-1 {} -2 {}".format(*file_names)
else:
- fastq_file_args = "-U {}".format()
+ fastq_file_args = "-U {}".format(*file_names)
aligner_args = "{app_folder}/hisat/hisat2 -p 4 --tmo {aligner_extra_args} -x {index_folder}/hisat2.index " \
"{fastq_file_names} -S {output_folder}/output.sam".\
@@ -155,12 +160,12 @@ def align_reads_hisat(sample_name, file_names, alignment_output_dir):
if aligner_process.returncode != 0:
raise ValueError("HISAT2 failed to complete (Non-zero return code)!\n"
- "HISAT2 stdout: {std_out} \nHISAT2 stderr: {std_err}".format(std_out=aligner_out,
- std_err=aligner_error))
+ "HISAT2 stdout: {std_out} \nHISAT2 stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
print('Completed reads alignment')
aligner_qc_output = []
- for line in aligner_error.split("\n"):
+ for line in aligner_error.decode("utf8").split("\n"):
line = line.strip()
# Check if line only give percentage info (we will ignore this line)
@@ -179,6 +184,39 @@ def align_reads_hisat(sample_name, file_names, alignment_output_dir):
return sam_file_name_output, aligner_qc_output
+
+def align_reads_subread(sample_name, file_names, alignment_output_dir):
+ # If paired read flag is required
+ paired_read = True if len(file_names) == 2 else False
+
+ print("Aligning reads...")
+ if paired_read:
+ fastq_file_args = "-r {} -R {}".format(*file_names)
+ else:
+ fastq_file_args = "-r {}".format(*file_names)
+
+ aligner_args = "{app_folder}/subread/subread-align -T 4 -t 0 --SAMoutput -u {aligner_extra_args} " \
+ "-i {index_folder}/genome {fastq_file_names} -o {output_folder}/output.sam".\
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/subread_index",
+ fastq_file_names=fastq_file_args,
+ output_folder=alignment_output_dir)
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError("Subread failed to complete (Non-zero return code)!\n"
+ "Subread stdout: {std_out} \nSubread stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+ print('Completed reads alignment')
+
+ aligner_qc_output = []
+ sam_file_name_output = "output.sam"
+
+ return sam_file_name_output, aligner_qc_output
+
#################################
# Counter
#################################
@@ -203,7 +241,7 @@ def count_reads_featurecount(sample_name, aligned_output_filepath, paired_reads,
raise ValueError("featureCount failed to complete! (Non-zero return code)\nCounter stdout: {} \n"
"Counter stderr: {}".format(counter_out, counter_error))
- if "[Errno" in counter_error.strip() or "error" in counter_error.strip().lower():
+ if "[Errno" in counter_error.decode("utf8").strip() or "error" in counter_error.decode("utf8").strip().lower():
raise ValueError("featureCount failed to complete! (Error)\nCounter stdout: {} \nCounter stderr: {}".
format(counter_out, counter_error))
@@ -252,13 +290,13 @@ def count_reads_htseq(sample_name, aligned_output_filepath, paired_reads, counte
raise ValueError("HTSeq failed to complete! (Non-zero return code)\nCounter stdout: {} \nCounter stderr: {}".
format(counter_out, counter_error))
- if "[Errno" in counter_error.strip():
+ if "[Errno" in counter_error.decode("utf8").strip():
raise ValueError("HTSeq failed to complete! (Error)\nCounter stdout: {} \nCounter stderr: {}".
format(counter_out, counter_error))
counter_output = []
counter_qc_output = []
- for gene_count in counter_out.strip().split("\n"):
+ for gene_count in counter_out.decode("utf8").strip().split("\n"):
if len(gene_count.strip().split()) == 0:
print(gene_count)
gene, count = gene_count.strip().split()
@@ -270,6 +308,71 @@ def count_reads_htseq(sample_name, aligned_output_filepath, paired_reads, counte
return counter_output, counter_qc_output
+#################################
+# Counting without alignment output
+#################################
+
+
+def count_reads_star(sample_name, file_names, alignment_output_dir):
+ # If paired read flag is required
+ # paired_read = True if len(file_names) == 2 else False
+
+ print("Counting reads...")
+ aligner_args = "{app_folder}/STAR/STAR --runThreadN 4 {aligner_extra_args} --genomeDir {index_folder} " \
+ "--readFilesIn {fastq_file_names} --outFileNamePrefix {output_folder} --quantMode GeneCounts".\
+ format(app_folder=APPLICATION_FOLDER,
+ aligner_extra_args="" if parser_result.aligner_extra_args is None else parser_result.aligner_extra_args,
+ index_folder=GENOME_REFERENCES_FOLDER + "/star_index",
+ fastq_file_names=" ".join(file_names),
+ output_folder=alignment_output_dir + "/")
+ print("Command: " + aligner_args)
+ aligner_process = Popen(shlex.split(aligner_args), stdout=PIPE, stderr=PIPE)
+ aligner_out, aligner_error = aligner_process.communicate()
+
+ if aligner_process.returncode != 0:
+ raise ValueError("STAR counting failed to complete (Non-zero return code)!\n"
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+
+ if aligner_error.decode("utf8").strip() != "" or not os.path.isfile(alignment_output_dir + "/Log.final.out"):
+ raise ValueError("STAR counting failed to complete (No output file is found)!\n"
+ "STAR stdout: {std_out} \nSTAR stderr: {std_err}".format(std_out=aligner_out.decode("utf8"),
+ std_err=aligner_error.decode("utf8")))
+
+ print('Completed read counts')
+
+ qc_output = []
+ with open(alignment_output_dir + "/Log.final.out") as aligner_qc:
+ for line in aligner_qc:
+ parts = line.strip().split("\t")
+ if len(parts) < 2:
+ continue
+ aligner_metric_name, aligner_metric_value = parts[0].strip("| "), parts[1].strip()
+ if aligner_metric_name.lower() in star_collected_metrics:
+ qc_output.append((sample_name + "\t" + "QC_STAR_" + aligner_metric_name.replace(" ", "_"),
+ int(aligner_metric_value)))
+
+ counter_output = []
+ with open(alignment_output_dir + "/ReadsPerGene.out.tab") as f:
+ # STAR produces 3 counts for unstranded, first strand and second strand
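+ # ReadsPerGene.out.tab columns after split: [gene ID, unstranded, first read strand, second read strand]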
+ count_index = 1
+ if parser_result.strand_specificity == "FIRST_READ_TRANSCRIPTION_STRAND":
+ count_index = 2
+ elif parser_result.strand_specificity == "SECOND_READ_TRANSCRIPTION_STRAND":
+ count_index = 3
+
+ for index, line in enumerate(f):
+ line = line.strip().split()
+
+ if len(line) == 0:
+ print("Skipping empty line {} in ReadsPerGene.out.tab".format(index))
+ continue
+
+ if line[0].startswith("N_"): # QC output
+ qc_output.append((sample_name + "\t" + "QC_STAR-count_" + line[0].lstrip("N_"), int(line[count_index])))
+ else:
+ counter_output.append((sample_name + "\t" + line[0], int(line[count_index])))
+
+ return counter_output, qc_output
#################################
# Picard tools
@@ -288,7 +391,8 @@ def run_picard(sample_name, aligned_output_filepath, picard_output_dir):
if not os.path.isfile(picard_output_dir + "/output.RNA_Metrics"):
raise ValueError("Picard tools failed to complete (No output file is found)!\n"
- "Picard tools stdout: {} \nPicard tools stderr: {}".format(picard_out, picard_error))
+ "Picard tools stdout: {} \nPicard tools stderr: {}".format(picard_out.decode("utf8"),
+ picard_error.decode("utf8")))
picard_qc_output = []
with open(picard_output_dir + "/output.RNA_Metrics") as picard_qc:
@@ -352,8 +456,15 @@ def alignment_count_step(keyval):
# Output: [sample_name\tgene, count] as [key,val]
global parser_result, star_collected_metrics, picard_collected_metrics
+ prefix_regex = r"(.*_part[0-9]*)\."
+
file_name, file_content = keyval
- prefix = file_name.rstrip("/").split("/")[-1].split(".")[0]
+ prefix_match = re.findall(prefix_regex, file_name.rstrip("/").split("/")[-1])
+
+ if len(prefix_match) != 1:
+ raise ValueError("Filename can not be resolved (invalid, pattern mismatch): {}".format(file_name))
+
+ prefix = prefix_match[0]
sample_name = prefix.rsplit("_part", 1)[0]
alignment_dir = TEMP_OUTPUT_FOLDER + "/alignment_" + prefix
@@ -374,31 +485,45 @@ def alignment_count_step(keyval):
except:
print('Alignment output directory {} exist.'.format(alignment_output_dir))
- if parser_result.aligner.lower() == "star":
- aligned_sam_output, aligner_qc_output = align_reads_star(sample_name, split_file_names, alignment_output_dir)
- elif parser_result.aligner.lower() == "hisat" or parser_result.aligner.lower() == "hisat2":
- aligned_sam_output, aligner_qc_output = align_reads_hisat(sample_name, split_file_names, alignment_output_dir)
- else:
- print("Aligner specified is not yet supported. Defaulting to STAR")
- aligned_sam_output, aligner_qc_output = align_reads_star(sample_name, split_file_names, alignment_output_dir)
+ aligner_qc_output = []
+ counter_qc_output = []
+ aligned_output_filepath = ""
- aligned_output_filepath = "{}/{}".format(alignment_output_dir.rstrip("/"), aligned_sam_output)
+ if parser_result.aligner.lower() == "star" and parser_result.counter.lower() == "star":
+ counter_output, aligner_qc_output = count_reads_star(sample_name, split_file_names, alignment_output_dir)
- if parser_result.counter.lower() == "featurecount" or parser_result.counter.lower() == "featurecounts":
- counter_output, counter_qc_output = count_reads_featurecount(sample_name, aligned_output_filepath, paired_reads,
- alignment_output_dir)
- elif parser_result.counter.lower() == "htseq":
- counter_output, counter_qc_output = count_reads_htseq(sample_name, aligned_output_filepath, paired_reads,
- alignment_output_dir)
else:
- print("Counter specified is not yet supported. Defaulting to featureCount")
- counter_output, counter_qc_output = count_reads_featurecount(sample_name, aligned_output_filepath, paired_reads,
+ if parser_result.aligner.lower() == "star":
+ aligned_sam_output, aligner_qc_output = align_reads_star(sample_name, split_file_names,
+ alignment_output_dir)
+ elif parser_result.aligner.lower() == "hisat" or parser_result.aligner.lower() == "hisat2":
+ aligned_sam_output, aligner_qc_output = align_reads_hisat(sample_name, split_file_names,
+ alignment_output_dir)
+ elif parser_result.aligner.lower() == "subread":
+ aligned_sam_output, aligner_qc_output = align_reads_subread(sample_name, split_file_names,
+ alignment_output_dir)
+ else:
+ print("Aligner specified is not yet supported. Defaulting to STAR")
+ aligned_sam_output, aligner_qc_output = align_reads_star(sample_name, split_file_names,
alignment_output_dir)
+ aligned_output_filepath = "{}/{}".format(alignment_output_dir.rstrip("/"), aligned_sam_output)
+
+ if parser_result.counter.lower() == "featurecount" or parser_result.counter.lower() == "featurecounts":
+ counter_output, counter_qc_output = count_reads_featurecount(sample_name, aligned_output_filepath,
+ paired_reads, alignment_output_dir)
+ elif parser_result.counter.lower() == "htseq":
+ counter_output, counter_qc_output = count_reads_htseq(sample_name, aligned_output_filepath, paired_reads,
+ alignment_output_dir)
+ else:
+ print("Counter specified is not yet supported. Defaulting to featureCount")
+ counter_output, counter_qc_output = count_reads_featurecount(sample_name, aligned_output_filepath,
+ paired_reads, alignment_output_dir)
+
counter_output.extend(aligner_qc_output)
counter_output.extend(counter_qc_output)
- if parser_result.run_picard:
+ if parser_result.run_picard and aligned_output_filepath != "":
picard_qc_output = run_picard(sample_name, aligned_output_filepath, alignment_output_dir)
counter_output.extend(picard_qc_output)
@@ -407,9 +532,10 @@ def alignment_count_step(keyval):
if __name__ == "__main__":
+ print("pickle protocol {}".format(pyspark.serializers.protocol))
global parser_result
- parser = argparse.ArgumentParser(description='Spark-based RNA-seq Pipeline')
+ parser = argparse.ArgumentParser(description='Spark-based RNA-seq Pipeline Quantification')
parser.add_argument('--input', '-i', action="store", dest="input_dir", help="Input directory - HDFS or S3")
parser.add_argument('--output', '-o', action="store", dest="output_dir", help="Output directory - HDFS or S3")
parser.add_argument('--annotation', '-a', action="store", dest="annotation_file",
@@ -419,11 +545,11 @@ def alignment_count_step(keyval):
, default="NONE")
parser.add_argument('--run_picard', '-rp', action="store_true", dest="run_picard", help="Run picard")
parser.add_argument('--aligner_tools', '-at', action="store", dest="aligner", nargs='?',
- help="Aligner to be used (STAR|HISAT2)", default="STAR")
+ help="Aligner to be used (STAR|HISAT2|Subread)", default="STAR")
parser.add_argument('--aligner_extra_args', '-s', action="store", dest="aligner_extra_args", nargs='?',
help="Extra argument to be passed to alignment tool", default="")
parser.add_argument('--counter_tools', '-ct', action="store", dest="counter", nargs='?',
- help="Counter to be used (featureCount|StringTie)", default="featureCount")
+ help="Counter to be used (featureCount|StringTie|STAR)", default="featureCount")
parser.add_argument('--counter_extra_args', '-c', action="store", dest="counter_extra_args", nargs='?',
help="Extra argument to be passed to quantification tool", default="")
parser.add_argument('--picard_extra_args', '-p', action="store", dest="picard_extra_args", nargs='?',
@@ -434,11 +560,10 @@ def alignment_count_step(keyval):
split_num = 0
- conf = SparkConf().setAppName("Spark-based RNA-seq Pipeline Multifile")
+ conf = SparkConf().setAppName("Spark-based RNA-seq Pipeline Quantification")
sc = SparkContext(conf=conf)
if parser_result.input_dir.startswith("s3://"): # From S3
-
s3_client = boto3.client('s3', region_name=parser_result.aws_region)
# Get number of input files
s3_paginator = s3_client.get_paginator('list_objects')
diff --git a/submit_alignment_job.py b/submit_alignment_job.py
new file mode 100644
index 0000000..d8ad256
--- /dev/null
+++ b/submit_alignment_job.py
@@ -0,0 +1,173 @@
+import configparser
+import argparse
+import boto3
+import utility
+import sys
+from collections import OrderedDict
+
+global job_configuration, cluster_id, spark_extra_config
+job_configuration = "alignment_job.config"
+cluster_id = ""
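+# Extra spark-submit --conf settings applied to every alignment step: memory fractions, YARN overhead and JVM GC/OOM options.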
+spark_extra_config = [("spark.memory.fraction", "0.5"),
+ ("spark.memory.storageFraction", "0.3"),
+ ("spark.python.profile", "true"),
+ ("spark.python.worker.reuse", "false"),
+ ("spark.yarn.executor.memoryOverhead", "12288"),
+ ("spark.driver.maxResultSize", "0"),
+ ("spark.executor.extraJavaOptions",
+ "-Dlog4j.configuration=file:///etc/spark/conf/log4j.properties "
+ "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=30 "
+ "-XX:MaxHeapFreeRatio=50 -XX:+CMSClassUnloadingEnabled "
+ "-XX:MaxPermSize=512M -XX:OnOutOfMemoryError='kill -9 %%p'"
+ "-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/app/oom_dump_`date`.hprof")]
+
+
+def check_configuration(config):
+ if not utility.check_config(config, "job_config", ["name", "action_on_failure", "alignment_script",
+ "alignment_script_s3_location", "upload_alignment_script"]):
+ return False
+
+ if not utility.check_upload_config(config["job_config"], "upload_alignment_script", "alignment_script",
+ "alignment_script_local_location", "alignment_script_s3_location"):
+ return False
+
+ if not utility.check_config(config, "spark_config", ["driver_memory", "executor_memory"]):
+ return False
+
+ if not utility.check_config(config, "script_arguments", ["input_location", "output_location",
+ "region", "aligner_tool"]):
+ return False
+
+ if not utility.check_s3_region(config["script_arguments"]["region"]):
+ return False
+
+ return True
+
+
+def calculate_num_executor(cluster_id, executor_memory):
+ global spark_extra_config
+
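+ # Each executor needs its JVM heap plus the YARN memory overhead (MB, converted to GB here);
+ # dividing the cluster's total memory by that footprint gives the number of executors to request.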
+ memory_overhead = 512
+ for conf in spark_extra_config:
+ if conf[0] == "spark.yarn.executor.memoryOverhead":
+ memory_overhead = int(conf[1])
+
+ memory_per_executor = int(executor_memory.strip("g")) + memory_overhead/1024
+
+ total_mem, total_cpu = utility.get_cluster_mem_cpu(cluster_id)
+
+ if total_mem < 0 or total_cpu < 0:
+ num_executors = -1 # dry run
+ else:
+ num_executors = int(total_mem/memory_per_executor)
+
+ return num_executors
+
+
+def build_command(config):
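+ # Assemble the EMR add_job_flow_steps arguments: a single step that runs spark-submit in
+ # cluster mode via command-runner.jar with the configured Spark settings and script arguments.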
+ global cluster_id
+
+ job_arguments = OrderedDict()
+ job_arguments["JobFlowId"] = cluster_id
+
+ step_arguments = OrderedDict()
+ step_arguments['Name'] = config["job_config"]["name"]
+ step_arguments["ActionOnFailure"] = config["job_config"]["action_on_failure"]
+
+ hadoop_arguments = OrderedDict()
+ hadoop_arguments["Jar"] = "command-runner.jar"
+
+ command_args = ["spark-submit",
+ "--deploy-mode", "cluster"]
+
+ for config_name, config_value in spark_extra_config:
+ command_args.append("--conf")
+ command_args.append("{}={}".format(config_name, config_value))
+
+ for spark_conf in config["spark_config"]:
+ command_args.append("--" + spark_conf.replace("_", "-"))
+ command_args.append(config["spark_config"][spark_conf])
+
+ command_args.append(config["job_config"]["alignment_script_s3_location"].rstrip("/") + "/" +
+ config["job_config"]["alignment_script"])
+
+ command_args.append("-i")
+ command_args.append(config["script_arguments"]["input_location"])
+ command_args.append("-o")
+ command_args.append(config["script_arguments"]["output_location"])
+
+ command_args.append("-at={}".format(config["script_arguments"]["aligner_tool"]))
+
+ if "aligner_extra_args" in config["script_arguments"] and config["script_arguments"]["aligner_extra_args"].strip() != "":
+ command_args.append('-s={}'.format(config["script_arguments"]["aligner_extra_args"]))
+
+ command_args.append("-r")
+ command_args.append(config["script_arguments"]["region"])
+
+ hadoop_arguments['Args'] = command_args
+ step_arguments["HadoopJarStep"] = hadoop_arguments
+ job_arguments["Steps"] = [step_arguments]
+ return job_arguments
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Job submission script for spark-based RNA-seq Alignment')
+ parser.add_argument('--config', '-c', action="store", dest="job_config", help="Job configuration file")
+ parser.add_argument('--cluster-id', '-id', action="store", dest="cluster_id", help="Cluster ID for submission")
+ parser.add_argument('--dry-run', '-d', action="store_true", dest="dry_run",
+ help="Produce the configurations for the job flow to be submitted")
+ parser_result = parser.parse_args()
+
+ if parser_result.job_config is not None and parser_result.job_config.strip() != "":
+ job_configuration = parser_result.job_config.strip()
+
+ config = configparser.ConfigParser()
+ config.optionxform = str
+ config.read(job_configuration)
+
+ if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
+ cluster_id = utility.get_cluster_id(parser_result.dry_run)
+ else:
+ cluster_id = parser_result.cluster_id.strip()
+
+ if cluster_id != "" and check_configuration(config):
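+ # Upload the Spark runner script to S3 if configured, size the executors to the cluster,
+ # then build and (unless this is a dry run) submit the EMR step.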
+ if config["job_config"].get("upload_alignment_script", "False") == "True":
+ utility.upload_files_to_s3([(config["job_config"]["alignment_script"],
+ config["job_config"]["alignment_script_local_location"],
+ config["job_config"]["alignment_script_s3_location"])], parser_result.dry_run)
+
+ num_executors = calculate_num_executor(cluster_id, config["spark_config"]["executor_memory"])
+ if num_executors < 0:
+ config["spark_config"]["num_executors"] = "None"
+ else:
+ config["spark_config"]["num_executors"] = str(num_executors)
+
+ config["spark_config"]["executor_cores"] = "1"
+
+ job_argument = build_command(config)
+
+ if not parser_result.dry_run:
+ emr_client = boto3.client("emr")
+ # warn user before removing any output
+ out = config["script_arguments"]["output_location"]
+ # find out which output dirs, if any, exist
+ dirs_to_remove = utility.check_s3_path_exists([out])
+ # create a list of the names of the directories to remove
+ if dirs_to_remove:
+ response = input("About to remove any existing output directories." +
+ "\n\n\t{}\n\nProceed? [y/n]: ".format(
+ '\n\n\t'.join(dirs_to_remove)))
+ while response not in ['y', 'n']:
+ response = input('Proceed? [y/n]: ')
+ if response == 'n':
+ print("Program Terminated. Modify config file to change " +
+ "output directories.")
+ sys.exit(0)
+ # remove the output directories
+ if not utility.remove_s3_files(dirs_to_remove):
+ print("Program terminated")
+ sys.exit(1)
+ job_submission = emr_client.add_job_flow_steps(**job_argument)
+ print("Submitted job to cluster {}. Job id is {}".format(cluster_id, job_submission["StepIds"][0]))
+ else:
+ print(job_argument)
diff --git a/submit_analysis_job.py b/submit_analysis_job.py
index 0a71dee..0e4e4f8 100644
--- a/submit_analysis_job.py
+++ b/submit_analysis_job.py
@@ -12,13 +12,13 @@
("spark.python.profile", "true"),
("spark.python.worker.reuse", "false"),
("spark.yarn.executor.memoryOverhead", "4096"),
- ("spark.driver.maxResultSize", "3g"),
+ ("spark.driver.maxResultSize", "0"),
("spark.executor.extraJavaOptions",
"-Dlog4j.configuration=file:///etc/spark/conf/log4j.properties "
"-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=30 "
"-XX:MaxHeapFreeRatio=50 -XX:+CMSClassUnloadingEnabled "
"-XX:MaxPermSize=512M -XX:OnOutOfMemoryError='kill -9 %%p'"
- " -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/app/oom_dump_`date`.hprof")]
+ " -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/app/oom_dump_`date`.hprof")]
def check_configuration(config):
@@ -107,7 +107,8 @@ def build_command(config):
if "run_picard" in config["script_arguments"] and config["script_arguments"]["run_picard"].lower() == "true":
command_args.append("--run_picard")
- if "aligner_extra_args" in config["script_arguments"] and config["script_arguments"]["aligner_extra_args"].strip() != "":
+ if "aligner_extra_args" in config["script_arguments"] and \
+ config["script_arguments"]["aligner_extra_args"].strip() != "":
command_args.append('-s={}'.format(config["script_arguments"]["aligner_extra_args"]))
if "counter_extra_args" in config["script_arguments"] and \
@@ -127,8 +128,9 @@ def build_command(config):
return job_arguments
+
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description='Job submission script for spark-based RNA-seq Pipeline')
+ parser = argparse.ArgumentParser(description='Job submission script for spark-based RNA-seq Quantification')
parser.add_argument('--config', '-c', action="store", dest="job_config", help="Job configuration file")
parser.add_argument('--cluster-id', '-id', action="store", dest="cluster_id", help="Cluster ID for submission")
parser.add_argument('--dry-run', '-d', action="store_true", dest="dry_run",
diff --git a/submit_assembly_job.py b/submit_assembly_job.py
new file mode 100644
index 0000000..00db2ca
--- /dev/null
+++ b/submit_assembly_job.py
@@ -0,0 +1,202 @@
+import configparser
+import argparse
+import boto3
+import utility
+import sys
+from collections import OrderedDict
+
+global job_configuration, cluster_id, spark_extra_config
+job_configuration = "assembly_job.config"
+cluster_id = ""
+spark_extra_config = [("spark.driver.maxResultSize", "0"),
+ ("spark.memory.fraction", "0.95"),
+ ("spark.memory.storageFraction", "0.05"),
+ ("spark.python.worker.reuse", "False"),
+ ("spark.python.worker.memory", "1024m"),
+ ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
+ ("spark.yarn.executor.memoryOverhead", "4096"),
+ ("spark.executor.extraJavaOptions",
+ "-Dlog4j.debug=true "
+ "-Dlog4j.configuration=file:///etc/spark/conf/log4j.properties "
+ "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=30 "
+ "-XX:MaxHeapFreeRatio=50 -XX:+CMSClassUnloadingEnabled "
+ "-XX:MaxPermSize=512M -XX:OnOutOfMemoryError='kill -9 %%p'"
+ " -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/app/oom_dump_`date`.hprof"
+ )
+ ]
+
+
+def check_configuration(config):
+ if not utility.check_config(config, "job_config", ["name", "action_on_failure", "assembly_script",
+ "assembly_script_s3_location", "upload_assembly_script"]):
+ return False
+
+ if not utility.check_upload_config(config["job_config"], "upload_assembly_script", "assembly_script",
+ "assembly_script_local_location", "assembly_script_s3_location"):
+ return False
+
+ if not utility.check_config(config, "spark_config", ["driver_memory", "executor_memory"]):
+ return False
+
+ if not utility.check_config(config, "script_arguments", ["input_location", "output_location", "annotation_file",
+ "enable_tiling", "enable_analysis", "region",
+ "aligner_tool", "assembler_tool",
+ "assembler_extra_args", "assembler_merge_extra_args"]):
+ return False
+
+ if not utility.check_s3_region(config["script_arguments"]["region"]):
+ return False
+
+ return True
+
+
+def calculate_num_executor(cluster_id, executor_memory):
+ global spark_extra_config
+
+ memory_overhead = 512
+ for conf in spark_extra_config:
+ if conf[0] == "spark.yarn.executor.memoryOverhead":
+ memory_overhead = int(conf[1])
+
+ memory_per_executor = int(executor_memory.strip("g")) + memory_overhead/1024
+
+ total_mem, total_cpu = utility.get_cluster_mem_cpu(cluster_id)
+
+ if total_mem < 0 or total_cpu < 0:
+ num_executors = -1 # dry run
+ else:
+ num_executors = int(total_mem/memory_per_executor)
+
+ return num_executors
+
+
+def build_command(config):
+ global cluster_id
+
+ job_arguments = OrderedDict()
+ job_arguments["JobFlowId"] = cluster_id
+
+ step_arguments = OrderedDict()
+ step_arguments['Name'] = config["job_config"]["name"]
+ step_arguments["ActionOnFailure"] = config["job_config"]["action_on_failure"]
+
+ hadoop_arguments = OrderedDict()
+ hadoop_arguments["Jar"] = "command-runner.jar"
+
+ command_args = ["spark-submit",
+ "--deploy-mode", "cluster"]
+
+ for config_name, config_value in spark_extra_config:
+ command_args.append("--conf")
+ command_args.append("{}={}".format(config_name, config_value))
+
+ for spark_conf in config["spark_config"]:
+ command_args.append("--" + spark_conf.replace("_", "-"))
+ command_args.append(config["spark_config"][spark_conf])
+
+ command_args.append(config["job_config"]["assembly_script_s3_location"].rstrip("/") + "/" +
+ config["job_config"]["assembly_script"])
+
+ command_args.append("-i")
+ command_args.append(config["script_arguments"]["input_location"])
+ command_args.append("-o")
+ command_args.append(config["script_arguments"]["output_location"])
+ command_args.append("-a={}".format(config["script_arguments"]["annotation_file"]))
+
+ command_args.append("-at={}".format(config["script_arguments"]["aligner_tool"]))
+ command_args.append("-as={}".format(config["script_arguments"]["assembler_tool"]))
+
+ if "aligner_extra_args" in config["script_arguments"] and \
+ config["script_arguments"]["aligner_extra_args"].strip() != "":
+ command_args.append('-s={}'.format(config["script_arguments"]["aligner_extra_args"]))
+
+ if "assembler_extra_args" in config["script_arguments"] and \
+ config["script_arguments"]["assembler_extra_args"].strip() != "":
+ command_args.append("-ag={}".format(config["script_arguments"]["assembler_extra_args"]))
+
+ if "assembler_merge_extra_args" in config["script_arguments"] and \
+ config["script_arguments"]["assembler_merge_extra_args"].strip() != "":
+ command_args.append("-am={}".format(config["script_arguments"]["assembler_merge_extra_args"]))
+
+ if "assembler_use_reference" in config["script_arguments"] and \
+ config["script_arguments"]["assembler_use_reference"].lower() == "true":
+ command_args.append("-aur")
+
+ if "enable_tiling" in config["script_arguments"] and config["script_arguments"]["enable_tiling"].lower() == "true":
+ command_args.append("-et")
+
+ if "enable_analysis" in config["script_arguments"] and \
+ config["script_arguments"]["enable_analysis"].lower() == "true":
+ command_args.append("-ea")
+
+ command_args.append("-r")
+ command_args.append(config["script_arguments"]["region"])
+
+ hadoop_arguments['Args'] = command_args
+ step_arguments["HadoopJarStep"] = hadoop_arguments
+ job_arguments["Steps"] = [step_arguments]
+
+ return job_arguments
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Job submission script for spark-based RNA-seq Transcript Assembly')
+ parser.add_argument('--config', '-c', action="store", dest="job_config", help="Job configuration file")
+ parser.add_argument('--cluster-id', '-id', action="store", dest="cluster_id", help="Cluster ID for submission")
+ parser.add_argument('--dry-run', '-d', action="store_true", dest="dry_run",
+ help="Produce the configurations for the job flow to be submitted")
+ parser_result = parser.parse_args()
+
+ if parser_result.job_config is not None and parser_result.job_config.strip() != "":
+ job_configuration = parser_result.job_config.strip()
+
+ config = configparser.ConfigParser()
+ config.optionxform = str
+ config.read(job_configuration)
+
+ if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
+ cluster_id = utility.get_cluster_id(parser_result.dry_run)
+ else:
+ cluster_id = parser_result.cluster_id.strip()
+
+ if cluster_id != "" and check_configuration(config):
+ if config["job_config"].get("upload_assembly_script", "False") == "True":
+ utility.upload_files_to_s3([(config["job_config"]["assembly_script"],
+ config["job_config"]["assembly_script_local_location"],
+ config["job_config"]["assembly_script_s3_location"])], parser_result.dry_run)
+
+ num_executors = calculate_num_executor(cluster_id, config["spark_config"]["executor_memory"])
+ if num_executors < 0:
+ config["spark_config"]["num_executors"] = "None"
+ else:
+ config["spark_config"]["num_executors"] = str(num_executors)
+
+ config["spark_config"]["executor_cores"] = "1"
+
+ job_argument = build_command(config)
+
+ if not parser_result.dry_run:
+ emr_client = boto3.client("emr")
+ # warn user before removing any output
+ out = config["script_arguments"]["output_location"]
+ # find out which output dirs, if any, exist
+ dirs_to_remove = utility.check_s3_path_exists([out])
+ # create a list of the names of the directories to remove
+ if dirs_to_remove:
+ response = input("About to remove any existing output directories." +
+ "\n\n\t{}\n\nProceed? [y/n]: ".format(
+ '\n\n\t'.join(dirs_to_remove)))
+ while response not in ['y', 'n']:
+ response = input('Proceed? [y/n]: ')
+ if response == 'n':
+ print("Program Terminated. Modify config file to change " +
+ "output directories.")
+ sys.exit(0)
+ # remove the output directories
+ if not utility.remove_s3_files(dirs_to_remove):
+ print("Program terminated")
+ sys.exit(1)
+ job_submission = emr_client.add_job_flow_steps(**job_argument)
+ print("Submitted job to cluster {}. Job id is {}".format(cluster_id, job_submission["StepIds"][0]))
+ else:
+ print(job_argument)