Skip to content

Commit

Permalink
Change folder structure and add some documentation for utility scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
cguyomar committed Nov 28, 2019
1 parent f88a5e8 commit d9f7b2f
Show file tree
Hide file tree
Showing 33 changed files with 395 additions and 307 deletions.
8 changes: 4 additions & 4 deletions MinYS.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import time

#from MtgMin_utils import MtgParser, ArgumentFormatterMtg, read_mapping_header, ProgressBar
from pipeline_utils import MtgParser, ArgumentFormatterMtg, contig_stats
from minys_utils.minys_utils import MtgParser, ArgumentFormatterMtg, contig_stats


#os.chdir(os.path.split(os.path.realpath(__file__))[0])
Expand Down Expand Up @@ -257,7 +257,7 @@

logger.info("Filtering contigs")
scriptPath = sys.path[0]
filteringCommand = os.path.join(scriptPath,"filter_contigs.py")
filteringCommand = os.path.join(scriptPath,"minys_utils/filter_contigs.py")
filteringCommand += " " + args.min_contig_size

filteredFile = assemblyPrefix + "_filtered_"+args.min_contig_size+".fa"
Expand Down Expand Up @@ -370,7 +370,7 @@


pipelineDir = os.path.abspath(os.path.dirname(sys.argv[0]))
simplificationCommand = [os.path.join(pipelineDir,"genome_graph/graph_simplification.py")]
simplificationCommand = [os.path.join(pipelineDir,"graph_simplification/graph_simplification.py")]
if os.path.isfile(simplificationCommand[0])==False:
logger.error("Script not found : "+simplificationCommand)

Expand All @@ -390,7 +390,7 @@
p.wait()

simplificationTime = time.time()
simplificationDuration = round(simplifiactionTime - gapfillingTime,1)
simplificationDuration = round(simplificationTime - gapfillingTime,1)


logger.info("Runtime :")
Expand Down
26 changes: 19 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@



MinYS allows targeted assembly of bacterial genomes using a reference-guided pipeline. It consists in 3 steps :
MinYS allows targeted assembly of bacterial genomes using a reference-guided pipeline. It consists in 3 steps :

- Mapping metagenomic reads on a reference genome using BWA. And assembling the recruited reads using [Minia](https://github.com/GATB/minia).
- Gapfilling the contig set using [MindTheGap](https://github.com/GATB/MindTheGap) in *contig mode*.
Expand All @@ -19,17 +19,17 @@ MinYS was developed in [GenScale](https://team.inria.fr/genscale/) by :

### Requirements

- [MindTheGap](https://github.com/GATB/MindTheGap)
- [MindTheGap](https://github.com/GATB/MindTheGap)
- [BWA](http://bio-bwa.sourceforge.net/) (read mapping)
- [Minia](https://github.com/GATB/minia) (contig assembly)
- [Bandage](https://github.com/rrwick/Bandage) (Optionnal, for assembly graph visualization)
- [Bandage](https://github.com/rrwick/Bandage) (Optionnal, for assembly graph visualization)

### Installation

```
git clone https://github.com/cguyomar/MinYS
cd MinYS
make -C nwalign/
make -C graph_simplification/nwalign/
./MinYS.py
```

Expand All @@ -49,15 +49,15 @@ make -C nwalign/
[assembly options]:
-minia-bin (1 arg) : path to Minia binary
-assembly-kmer-size (1 arg) : kmer size used for Minia assembly (should be given even if bypassing minia assembly step, usefull knowledge for gap-filling) [Default: 31]
-assembly-abundance-min
-assembly-abundance-min
(1 arg) : Minimal abundance of kmers used for assembly [Default: auto]
-min-contig-size (1 arg) : minimal size for a contig to be used in gapfilling [Default: 0]
[gapfilling options]:
-mtg-dir (1 arg) : path to MindTheGap build directory
-gapfilling-kmer-size
-gapfilling-kmer-size
(1 arg) : kmer size used for gapfilling [Default: 31]
-gapfilling-abundance-min
-gapfilling-abundance-min
(1 arg) : Minimal abundance of kmers used for gapfilling [Default: auto]
-max-nodes (1 arg) : Maximum number of nodes in contig graph [Default: 100]
-max-length (1 arg) : Maximum length of gapfilling (nt) [Default: 10000]
Expand All @@ -75,3 +75,15 @@ make -C nwalign/
In the first case, `-assembly-kmer-size` should be supplied as the overlap between contigs.


### Utility scripts :

Some utility scripts are supplied along with MinYS in order to facilitate the post processing of the gfa graph :

- `graph_simplification/enumerate_paths.py in.gfa out_dir`
Enumerate all the paths in connected components of a graph. Returns paths with a significant difference (ANI < 99\% or alignment coverage <99\%)

- `graph_simplification/filter_components.py in.gfa min_size`
Return a sub-graph containing all the connected components longer than `min_size`

- `graph_simplification/gfa2fasta.py in.gfa out.fa`
Return each sequence of the graph in a multi-fasta file
64 changes: 0 additions & 64 deletions genome_graph/data/simple.gfa

This file was deleted.

64 changes: 0 additions & 64 deletions genome_graph/data/simple2.gfa

This file was deleted.

64 changes: 0 additions & 64 deletions genome_graph/data/simple3.gfa

This file was deleted.

64 changes: 0 additions & 64 deletions genome_graph/data/simple4.gfa

This file was deleted.

Empty file removed genome_graph/tests/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
#!/usr/bin/env python3

"""
Enumerate all paths going through the longest node of a gfa graph
-> Enumerates paths in one connected component only for now
For each connected component of a gfa graph, enumerates all path going through the longest node of the component
Merged similar paths using ani
"""

import argparse

import genome_graph


import shutil
import os
from csv import reader
from progress.bar import Bar
import sys


from genome_graph.genome_graph import GenomeGraph

def write2fasta(seq,seqName,fileName):
ofile = open(os.path.join(fileName), "w")
Expand All @@ -33,7 +34,7 @@ def run_pyani(tempDir):
cov = min(read_pyani_output(os.path.join(tempDir,"ani_out/ANIm_alignment_coverage.tab")))

shutil.rmtree(os.path.join(tempDir,"./ani_out"))

return(identity,cov)

def read_pyani_output(file):
Expand Down Expand Up @@ -116,7 +117,7 @@ def compare_paths(paths,outDir):
print("Found a new path with cov ="+str(maxCov)+" and id="+str(maxId))
nb_unique_paths += 1
shutil.move(os.path.join(tmpDir,seqName+".fa"),os.path.join(outDir,seqName+".fa"))

bar.next()
bar.finish()
print("Number of unique Paths : "+str(nb_unique_paths)+"\n")
Expand All @@ -135,7 +136,7 @@ def compare_paths(paths,outDir):

# Read graph
print("Loading graph")
g = genome_graph.GenomeGraph.read_gfa(opts.infile)
g = GenomeGraph.read_gfa(opts.infile)

# List connected components
comps = g.connected_components()
Expand All @@ -154,7 +155,3 @@ def compare_paths(paths,outDir):
compare_paths(paths,compDir)

compIter += 1




Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import argparse

import genome_graph
from genome_graph import genome_graph


op = argparse.ArgumentParser()
op.add_argument("infile")
Expand All @@ -15,7 +16,7 @@
opts = op.parse_args()

print("Loading graph")
g = genome_graph.GenomeGraph.read_gfa(opts.infile)
g = genome_graph.GenomeGraph.read_gfa(opts.infile)

for comp in g.connected_components():
#print(comp)
Expand All @@ -27,4 +28,3 @@
g.rem_node(n)

g.write_gfa(opts.outfile)

189 changes: 189 additions & 0 deletions graph_simplification/genome_graph/Lc.schizaphis.gfa

Large diffs are not rendered by default.

File renamed without changes.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
import re
import os
import sys
from utils import reverse_complement,compare_strings, nw_align, locate_nw_binary
from .utils import reverse_complement,compare_strings, nw_align, locate_nw_binary
#from alignment import PairAlign
from paths import Path,setExtend
from .paths import Path,setExtend

nwCommand = locate_nw_binary()

Expand Down Expand Up @@ -369,15 +369,45 @@ def find_all_paths(self,startNode):
p = Path(self,startNode)
paths = {p}
extendedPaths, extended = setExtend(paths,self)
#print(extended)
nbExtension = 1
while extended:
#print(str(len(extendedPaths))+"\n")
nbExtension += 1

paths = extendedPaths.copy()

# Removing terminated non circular paths
#print(str(len(paths)))
paths = {p for p in paths if p.extendable == True}
#print(str(len(paths)))

# There are smarter things to do : we don't need to try to extend the whole set
extendedPaths, extended = setExtend(paths,self)
#for p in extendedPaths:
# print(p.nodeIds)
return(extendedPaths)


def find_all_cyclic_paths(self,startNode):
# Enumerates all possible paths going through a node
p = Path(self,startNode)
paths = {p}
extendedPaths, extended = setExtend(paths,self)
nbExtension = 1
while extended and len(extendedPaths)<200:
print(str(len(extendedPaths))+"\n")
nbExtension += 1

# There are smarter things to do : we don't need to try to extend the whole set
paths = extendedPaths.copy()
extendedPaths, extended = setExtend(paths,self)
for p in extendedPaths:
print(p.nodeIds)
return(extendedPaths)



def BFS(self, n):
res = set()
visited = [False] * (self.maxId+1) # Nodes are 1-indexed
Expand Down
Loading

0 comments on commit d9f7b2f

Please sign in to comment.