Skip to content

Commit

Permalink
Merge pull request #292 from sanger-tol/bedgraph_fix
Browse files Browse the repository at this point in the history
selfcomp fix and telomere output fix
  • Loading branch information
DLBPointon authored Apr 4, 2024
2 parents 7f69726 + a235edf commit bc6c484
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 36 deletions.
10 changes: 6 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,17 @@ This builds on the initial release by adding subworkflows which generate kmer ba
- Updated the minimap2 align module to remove samtools view in favour of paftools for our use case.
- Updated the test.yml in line with the above changes.
- Updated the SELFCOMP subworkflow to allow for the parallelisation of the work on large genomes.
- Updated the READ_COVERAGE subworkflow to produce the AVG coverage and STND coverage
- Updated the READ_COVERAGE subworkflow to produce the scaffold based AVG coverage and STND coverage
- Updated Modules from NF-Core - mostly relates to module structure rather than software.
- Updated the SummaryStats output to include HiC container counts.
- Added -T / -t flags where possible to minimise the use of the /tmp directory.
- Replaced CONCAT_MUMMER with CATCAT for simplicity.
- Removed JUICER from the RAPID entrypoint.
- Removed the csi or tbi logic. CSI is now used by default; this simplified the workflow and moved the logic block previously required.
- Added NF-DOWNLOAD to the CI-CD due to an error which causes incomplete downloads when downloading a number of images at the same time.
- Added the RAPID_TOL entry point which is more geared towards requirements of Sanger.
- Removed the csi or tbi logic. CSI is now used by default; this simplifies the workflow and enlarges the capacity to handle much larger genomes. The logic block previously required was then moved.
- Added NF-DOWNLOAD to the CI-CD due to an error that causes incomplete downloads when downloading a number of images at the same time.
- Added the RAPID_TOL entry point which is more geared towards the requirements of Sanger.
- Fixed a bug in build_alignment_block.py to avoid indexing errors in large genomes.
- Changed the BEDGRAPH output from the EXTRACT_TELO module.

### Parameters

Expand Down
52 changes: 22 additions & 30 deletions bin/build_alignment_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,43 +40,38 @@ def build_block(mylist):
qlist = []
nlist = []

for idx, x in enumerate(mylist):
idx = 0
while idx < len(mylist):
x = mylist[idx]
if idx < len(mylist) - 1:
qcurrent = int(x[6])
rcurrent = int(x[1])
qnext = mylist[idx + 1][6]
leftover = mylist[idx : len(mylist)]

# leftd = int(max((x[6]-qcurrent for x in leftover), default=0))

leftd = list(x[6] - qmin for x in leftover)

positives = [x for x in leftd if x > 0]

min_value = min((positives), default=0)

indmin = leftd.index(min_value)

rm = leftover[indmin][1]

if qcurrent > qmin and qcurrent < qnext and rm == rcurrent:
qmin = qcurrent
qlist.append(idx)

if qcurrent > qmin and qcurrent < qnext and rm > rcurrent:
nlist.append(idx)

if qcurrent > qmin and qcurrent > qnext:
nlist.append(idx)

if qcurrent < qmin and qcurrent > qnext:
nlist.append(idx)
leftd = [int(y[6]) - qmin for y in mylist[idx:]]
positives = list(filter(lambda x: x > 0, leftd))

if positives:
min_value = min(positives)
indmin = leftd.index(min_value)
rm = mylist[idx + indmin][1]

if qcurrent > qmin and qcurrent < qnext and rm == rcurrent:
qmin = qcurrent
qlist.append(idx)
elif qcurrent > qmin and qcurrent < qnext and rm > rcurrent:
nlist.append(idx)
elif qcurrent > qmin and qcurrent > qnext or qcurrent < qmin and qcurrent > qnext:
nlist.append(idx)
else:
idx += 1
continue

if idx == len(mylist) - 1:
if mylist[idx][6] > qmin:
qlist.append(idx)
else:
nlist.append(idx)
idx += 1

alignment_chain = [mylist[i] for i in qlist]
new_list = [mylist[i] for i in nlist]
Expand Down Expand Up @@ -122,12 +117,9 @@ def build_block(mylist):
while newlist:
blocks, newlist = build_block(newlist)

# fileprefix = "".join(random.choices(string.ascii_lowercase + string.digits, k=12))
# filename = fileprefix + ".block"
newblocks = [
[x if i != 3 else y[3] + ":" + str(y[6]) + ":" + str(y[7]) for i, x in enumerate(y)] for y in blocks
]

a = pybedtools.BedTool(newblocks)
merged = a.merge(d=100000, c="4,7,8", o="collapse,min,max", delim="|")
fo.write(str(merged))
Expand Down
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ process {
withName: '.*:.*:READ_COVERAGE:MINIMAP2_ALIGN' {
cpus = { check_max( 20 * 1, 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 3.5e9 ? 30 : 60), 'time' ) }
time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 60), 'time' ) }
}

withName: '.*:.*:READ_COVERAGE:BEDTOOLS_GENOMECOV' {
Expand Down
2 changes: 1 addition & 1 deletion modules/local/extract_telo.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ process EXTRACT_TELO {
def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
$/
cat "${file}" |awk '{print $2"\t"$4"\t"$5}'|sed 's/>//g' > ${prefix}_telomere.bed
cat "${file}" |awk '{print $2"\t"$4"\t"$5"\t"$6}'|sed 's/>//g' > ${prefix}_telomere.bedgraph
cat "${file}" |awk '{print $2"\t"$4"\t"$5"\t"((($5-$4)<0)?-($5-$4):($5-$4))}' | sed 's/>//g' > ${prefix}_telomere.bedgraph

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down

0 comments on commit bc6c484

Please sign in to comment.