Skip to content

Commit

Permalink
Merge pull request #174 from nservant/dev
Browse files Browse the repository at this point in the history
bwa-mem / pairtools
  • Loading branch information
nservant authored Mar 6, 2024
2 parents e2e9832 + 4f2d24a commit a83615a
Show file tree
Hide file tree
Showing 132 changed files with 4,517 additions and 189 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,17 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## dev
## v2.2.0dev

### `Added`

- Add new '--balancing_opts' to update `cooler balance` arguments

- New subworkflow based on `pairtools` to detect valid pairs. The user
can now choose between `--processing hicpro` (default) or `--processing pairtools`

- Default mapping options with `HiC-Pro` has been updated to give closer results in comparison
with `BWA-mem/pairtools`

### `Removed`

Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ On release, automated continuous integration tests run the pipeline on a full-si
2. Detection of valid interaction products
3. Duplicates removal
4. Generate raw and normalized contact maps ([`iced`](https://github.com/hiclib/iced))
2. Generate `pairs` files for downstream analysis
3. [`Pairtools`](https://github.com/open2c/pairtools)
1. Mapping using [`BWA-mem`](https://github.com/lh3/bwa)
4. Detection of valid interaction products with [`pairtools`](https://github.com/open2c/pairtools)
5. Duplicates removal
6. Generate `pairs` files for downstream analysis
3. Create genome-wide contact maps at various resolutions ([`cooler`](https://github.com/open2c/cooler))
4. Contact maps normalization using balancing algorithm ([`cooler`](https://github.com/open2c/cooler))
5. Export to various contact maps formats ([`HiC-Pro`](https://github.com/nservant/HiC-Pro), [`cooler`](https://github.com/open2c/cooler))
Expand Down
13 changes: 8 additions & 5 deletions bin/digest_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def replaceN(cs):
"specified."
),
)
parser.add_argument("--extra", default=False, action='store_true', help="Add extra BED columns")
parser.add_argument("-o", "--out", default=None)
args = parser.parse_args()

Expand Down Expand Up @@ -193,8 +194,10 @@ def replaceN(cs):
# allow to remove cases where the enzyme cut at
# the first position of the chromosome
if end > begin:
frag_id += 1
frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id))
outfile.write(
"{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name))
)
if args.extra:
frag_id += 1
frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id))
outfile.write("{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name)))
else:
outfile.write("{}\t{}\t{}\n".format(str(chrom_name), int(begin), int(end)))

139 changes: 137 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ process {
]
}

withName: 'BWA_INDEX' {
publishDir = [
path: { "${params.outdir}/genome/bwa" },
mode: 'copy',
enabled: params.save_reference
]
}

withName: 'CUSTOM_GETCHROMSIZES' {
publishDir = [
path: { "${params.outdir}/genome" },
Expand All @@ -39,6 +47,7 @@ process {
mode: 'copy',
enabled: params.save_reference
]
ext.args = params.processing == "hicpro" ? '--extra' : ''
}

//*******************************************
Expand Down Expand Up @@ -133,7 +142,6 @@ process {
[
path: { "${params.outdir}/hicpro/valid_pairs" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: "*Pairs"
]
]
Expand All @@ -143,7 +151,6 @@ process {
withName: 'MERGE_STATS' {
publishDir = [
path: { "${params.outdir}/hicpro/stats/${meta.id}" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
pattern: "*stat"
]
Expand Down Expand Up @@ -174,6 +181,133 @@ process {
]
}

//*******************************************
// PAIRTOOLS

withName: 'BWA_MEM' {
publishDir = [
path: { "${params.outdir}/bwa" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
enabled: params.save_aligned_intermediates
]
ext.args = params.bwa_opts ?: ''
}

withName: 'PAIRTOOLS_PARSE' {
publishDir = [
path: { "${params.outdir}/pairtools/stats/" },
mode: 'copy',
pattern: "*pairsam.stat"
]
ext.args = { [
'--add-columns mapq',
params.save_interaction_bam ? '' : '--drop-sam --drop-seq',
params.pairtools_parse_opts ?: ''
].join(' ').trim() }
ext.prefix = { "${meta.id}_${meta.chunk}" }
}

withName: 'PAIRTOOLS_RESTRICT' {
publishDir = [
enabled: false
]
ext.prefix = { "${meta.id}_${meta.chunk}_restrict" }
ext.when = !params.dnase
}

withName: 'PAIRTOOLS_SORT' {
publishDir = [
enabled: false
]
ext.prefix = { "${meta.id}_${meta.chunk}_sorted" }
ext.args = { "--tmpdir ./" }
}

withName: 'PAIRTOOLS_MERGE' {
publishDir = [
enabled: false
]
ext.prefix = { "${meta.id}_merged" }
}

withName: 'PAIRTOOLS_SPLIT' {
publishDir = [
path: { "${params.outdir}/pairtools" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
pattern: "*pairs.gz",
enabled: params.save_pairs_intermediates
]
ext.args = { params.save_interaction_bam ? "--output-sam ${meta.id}_pairtools.bam" : '' }
}

withName: 'SAMTOOLS_SORT' {
publishDir = [
path: { "${params.outdir}/pairtools" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
enabled: params.save_pairs_intermediates
]
ext.prefix = { "${meta.id}_pairtools_sorted" }
}

withName: 'SAMTOOLS_INDEX' {
publishDir = [
path: { "${params.outdir}/pairtools" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
enabled: params.save_pairs_intermediates
]
}

withName: 'PAIRTOOLS_DEDUP' {
publishDir = [
path: { "${params.outdir}/pairtools/stats/" },
mode: 'copy',
pattern: "*.pairs.stat"
]
ext.args = { "--mark-dups" }
ext.prefix = { "${meta.id}_dedup" }
ext.when = !params.keep_dups
}

withName: 'PAIRTOOLS_SELECT' {
publishDir = [
path: { "${params.outdir}/pairtools/" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy',
pattern: "*pairs.gz"
]
ext.args = { [
params.min_mapq > 0 ? "(mapq1>${params.min_mapq} and mapq2>${params.min_mapq})" : '',
params.min_cis_dist > 0 ? " and ((chrom1==chrom2 and abs(pos1-pos2) > ${params.min_cis_dist}) or chrom1!=chrom2)" : '',
params.keep_multi ? " and ((pair_type.upper()=='UU') or (pair_type.upper()=='UR') or (pair_type.upper()=='RU') or (pair_type.upper()=='MM') or (pair_type.upper()=='MU'))" : " and ((pair_type.upper()=='UU') or (pair_type.upper()=='UR') or (pair_type.upper()=='RU'))",
params.dnase ? '' : " and ((chrom1==chrom2 and abs(int(rfrag1) - int(rfrag2)) > 1) or chrom1!=chrom2)",
//params.min_insert_size > 0 ? " and ( (rfrag_end1 - r1pos) + (rfrag_end2 - r2pos)) > ${params.min_insert_size}" : '',
//params.max_insert_size > 0 ? " and ( (rfrag_end1 - r1pos) + (rfrag_end2 - r2pos)) < ${params.max_insert_size}" : '',
//params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : '',
//params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : '',
].join(' ').trim() }
}

withName: 'PAIRTOOLS_STATS' {
publishDir = [
path: { "${params.outdir}/pairtools/stats/" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.prefix = { "${meta.id}_select.pairs.stat" }
}

withName: 'PAIRIX' {
publishDir = [
path: { "${params.outdir}/pairtools" },
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
mode: 'copy'
]
}

//*****************************************
// QUALITY METRICS

Expand Down Expand Up @@ -215,6 +349,7 @@ process {
mode: 'copy'
]
ext.args = '--force'
ext.args = params.balancing_opts ?: ''
ext.prefix = { "${cool.baseName}_balanced" }
}

Expand Down
10 changes: 6 additions & 4 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ params {

// Annotations
fasta = 'https://github.com/nf-core/test-datasets/raw/hic/reference/W303_SGD_2015_JRIU00000000.fsa'
//fasta = '/data/annotations/pipelines/Human/hg38/genome/hg38.fa'
//bwa_index = '/data/annotations/pipelines/Human/hg38/indexes/bwamem2/'
digestion = 'hindiii'
min_mapq = 10
min_restriction_fragment_size = 100
max_restriction_fragment_size = 100000
min_insert_size = 100
max_insert_size = 600
// min_restriction_fragment_size = 100
// max_restriction_fragment_size = 100000
// min_insert_size = 100
// max_insert_size = 600

bin_size = '2000,1000'
res_dist_decay = '1000'
Expand Down
120 changes: 120 additions & 0 deletions docs/benchmark.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# nf-core/hic: Output

## Benchmark `HiC-Pro` vs `pairtools`

### Profile test

Comparison of the test profile run with `HiC-Pro` ;

```bash
nextflow run nf-core-hic/main.nf \
-profile singularity,test \
--processing 'hicpro' --save_pairs_intermediates \
--outdir '/tmp/results_test_hicpro/'
```

and `bwa-mem/pairtools` ;

```bash
nextflow run nf-core-hic/main.nf \
-profile singularity,test \
--processing 'pairtools' --save_pairs_intermediates \
--outdir '/tmp/results_test_pairtools/'
```

The idea here was just to have a look at the final list of selected (and unselected)
read pairs classified as valid interactions (or spurious interaction products)

Here is a quick summary statistics ;

```bash
# valid pairs from HiC-Pro = 64191
# valid pairs from pairtools = 79585
# common valid pairs = 59062 (69.72%)
# common filtered pairs = 281201 (87.42%)
----------------------------------------
# Difference in valid pairs = 25652 (30.28%)
# Unmapped with HiC-Pro = 19945 (77.75%)
# Unmapped with Pairtools = 443 (1.73%)
# Filtered by HiC-Pro only = 578 (2.25%)
# Filtered by Pairtools only = 4686 (18.27%)
----------------------------------------
# Difference in filtered pairs = 40478 (12.58%)
# Unmapped with HiC-Pro = 31210 (77.10%)
# Unmapped with Pairtools = 4004 (9.89%)
# Filtered by HiC-Pro only = 578 (1.43%)
# Filtered by Pairtools only = 4686 (11.58%)
```

Overall, we can see that **70%** of valid interactions are called by both `HiC-Pro` and `Pairtools`.
Regarding the 30% of read pairs which are different between the two tools, we can see that a large
majority (>75%) are due to differences in the read mapping (`bowtie2` versus `bwa-mem`).

The few other differences can be at least partly explain by differences in the read pairs selection such as how a read is assigned
to a restriction fragments, etc.

### Full test dataset (SRX2636669)

We then applied the same approach on the full test profile which is based on Mouse Hi-C data from embryonic stem cells
(Bonev et al., 2017 - GSE96107) processed either with `HiC-Pro` ;

```bash
nextflow run nf-core-hic/main.nf \
--input './data/SRX2636669/samplesheet.csv' \
--processing 'hicpro' --save_pairs_intermediates \
--genome 'mm10' \
--digestion 'dpnii' \
--bin_size '40000,250000,500000,1000000' \
--res_compartments '500000,250000' \
--res_tads '40000,20000' \
--outdir './results_SRX2636669_hicpro/' \
-profile singularity
```

or `bwa-mem/pairtools` ;

```bash
nextflow run nf-core-hic/main.nf \
--input './data/SRX2636669/samplesheet.csv' \
--processing 'pairtools' --save_pairs_intermediates \
--genome 'mm10' \
--digestion 'dpnii' \
--bin_size '40000,250000,500000,1000000' \
--res_compartments '500000,250000' \
--res_tads '40000,20000' \
--outdir './results_SRX2636669_pairtools/'
-profile singularity
```

As before, small statistics were computed to compare the list of valid (and not valid) interactions.

```bash
# valid pairs from HiC-Pro = 225982881
# valid pairs from pairtools = 229880874
# common valid pairs = 200026227 (78.18%)
# common filtered pairs = 71619180 (54.43%)
----------------------------------------
# Difference in valid pairs = 55811301 (21.82%)
# Unmapped with HiC-Pro = 21060023 (37.73%)
# Unmapped with Pairtools = 19570337 (35.07%)
# Filtered by HiC-Pro only = 8794624 (15.76%)
# Filtered by Pairtools only = 6386317 (11.44%)
----------------------------------------
# Difference in filtered pairs = 59967305 (45.57%)
# Unmapped with HiC-Pro = 30122043 (50.23%)
# Unmapped with Pairtools = 14664321 (24.45%)
# Filtered by HiC-Pro only = 8794624 (14.67%)
# Filtered by Pairtools only = 6386317 (10.65%)
```

Almost **80%** of valid interactions are called in common by `HiC-Pro` and `pairtools`.
As previously observed, most of the differences observed between the two tools are
explained by distinct mapping procedures.

Finally, we generated the contact maps around a specific regions on the X chromosome
using the `cool` files and the TADs calling generated with both tools.
**No difference is observed at the contact map level.**

![X Inactivation Center - HiC-Pro processing](./images/SRX2636669_hicpro_pygentracks.png)

![X Inactivation Center - Bwa-mem / pairtools](./images/SRX2636669_pairtools_pygentracks.png)
Binary file added docs/images/SRX2636669_hicpro_pygentracks.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 4 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
# conda env create2-f environment.yml
name: nf-core-hic-2.0.0
channels:
- conda-forge
Expand All @@ -16,7 +16,9 @@ dependencies:
- bioconda::pysam=0.19.0=py39h5030a8b_0
- conda-forge::pymdown-extensions=7.1=pyh9f0ad1d_0
- bioconda::cooler=0.8.11=pyh5e36f6f_1
- bioconda::cooltools=0.5.1=py39h5371cbf_1
- bioconda::pairtools=1.0.2
- bioconda::cooltools=0.5.1
- bioconda::bwa-mem2=2.2.1
- bioconda::bowtie2=2.4.5=py39hd2f7db1_2
- bioconda::samtools=1.15.1=h1170115_0
- bioconda::multiqc=1.12=pyhdfd78af_0
Expand Down
Loading

0 comments on commit a83615a

Please sign in to comment.