Change folder structure and add some documentation for utility scripts

cguyomar · Nov 28, 2019 · d9f7b2f · d9f7b2f
1 parent f88a5e8
commit d9f7b2f
Show file tree

Hide file tree

Showing 33 changed files with 395 additions and 307 deletions.
diff --git a/MinYS.py b/MinYS.py
@@ -20,7 +20,7 @@
 import time
 
 #from MtgMin_utils import MtgParser, ArgumentFormatterMtg, read_mapping_header, ProgressBar
-from pipeline_utils import MtgParser, ArgumentFormatterMtg, contig_stats
+from minys_utils.minys_utils import MtgParser, ArgumentFormatterMtg, contig_stats
 
 
 #os.chdir(os.path.split(os.path.realpath(__file__))[0])
@@ -257,7 +257,7 @@
 
 logger.info("Filtering contigs")
 scriptPath = sys.path[0]
-filteringCommand = os.path.join(scriptPath,"filter_contigs.py")
+filteringCommand = os.path.join(scriptPath,"minys_utils/filter_contigs.py")
 filteringCommand += " " + args.min_contig_size
 
 filteredFile = assemblyPrefix + "_filtered_"+args.min_contig_size+".fa"
@@ -370,7 +370,7 @@
 
 
 pipelineDir = os.path.abspath(os.path.dirname(sys.argv[0]))
-simplificationCommand = [os.path.join(pipelineDir,"genome_graph/graph_simplification.py")]
+simplificationCommand = [os.path.join(pipelineDir,"graph_simplification/graph_simplification.py")]
 if os.path.isfile(simplificationCommand[0])==False:
     logger.error("Script not found : "+simplificationCommand)
 
@@ -390,7 +390,7 @@
 p.wait()
 
 simplificationTime = time.time()
-simplificationDuration = round(simplifiactionTime - gapfillingTime,1)
+simplificationDuration = round(simplificationTime - gapfillingTime,1)
 
 
 logger.info("Runtime :")

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 
 
-MinYS allows targeted assembly of bacterial genomes using a reference-guided pipeline. It consists in 3 steps : 
+MinYS allows targeted assembly of bacterial genomes using a reference-guided pipeline. It consists of 3 steps:
 
 - Mapping metagenomic reads to a reference genome using BWA, and assembling the recruited reads using [Minia](https://github.com/GATB/minia).
 - Gapfilling the contig set using [MindTheGap](https://github.com/GATB/MindTheGap) in *contig mode*.
@@ -19,17 +19,17 @@ MinYS was developed in [GenScale](https://team.inria.fr/genscale/) by :
 
 ### Requirements
 
--  [MindTheGap](https://github.com/GATB/MindTheGap)
+- [MindTheGap](https://github.com/GATB/MindTheGap)
 - [BWA](http://bio-bwa.sourceforge.net/) (read mapping)
 - [Minia](https://github.com/GATB/minia) (contig assembly)
-- [Bandage](https://github.com/rrwick/Bandage) (Optionnal, for assembly graph visualization) 
+- [Bandage](https://github.com/rrwick/Bandage) (optional, for assembly graph visualization)
 
 ### Installation
 
 ```
 git clone https://github.com/cguyomar/MinYS
 cd MinYS
-make -C nwalign/
+make -C graph_simplification/nwalign/
 ./MinYS.py
 ```
 
@@ -49,15 +49,15 @@ make -C nwalign/
 [assembly options]:
   -minia-bin            (1 arg) :    path to Minia binary
   -assembly-kmer-size   (1 arg) :    kmer size used for Minia assembly (should be given even if bypassing minia assembly step, usefull knowledge for gap-filling) [Default: 31]
-  -assembly-abundance-min 
+  -assembly-abundance-min
                         (1 arg) :    Minimal abundance of kmers used for assembly [Default: auto]
   -min-contig-size      (1 arg) :    minimal size for a contig to be used in gapfilling [Default: 0]
 
 [gapfilling options]:
   -mtg-dir              (1 arg) :    path to MindTheGap build directory
-  -gapfilling-kmer-size 
+  -gapfilling-kmer-size
                         (1 arg) :    kmer size used for gapfilling [Default: 31]
-  -gapfilling-abundance-min 
+  -gapfilling-abundance-min
                         (1 arg) :    Minimal abundance of kmers used for gapfilling [Default: auto]
   -max-nodes            (1 arg) :    Maximum number of nodes in contig graph [Default: 100]
   -max-length           (1 arg) :    Maximum length of gapfilling (nt) [Default: 10000]
@@ -75,3 +75,15 @@ make -C nwalign/
   In the first case, `-assembly-kmer-size` should be supplied as the overlap between contigs.
 
 
+### Utility scripts :
+
+Some utility scripts are supplied with MinYS to facilitate post-processing of the GFA graph:
+
+- `graph_simplification/enumerate_paths.py in.gfa out_dir`
+  Enumerate all the paths in each connected component of a graph. Return the paths that differ significantly from one another (ANI < 99\% or alignment coverage < 99\%)
+
+- `graph_simplification/filter_components.py in.gfa min_size`
+   Return a sub-graph containing all the connected components longer than `min_size`
+
+- `graph_simplification/gfa2fasta.py in.gfa out.fa`
+  Return each sequence of the graph in a multi-fasta file
diff --git a/genome_graph/data/simple.gfa b/genome_graph/data/simple.gfa
diff --git a/genome_graph/data/simple2.gfa b/genome_graph/data/simple2.gfa
diff --git a/genome_graph/data/simple3.gfa b/genome_graph/data/simple3.gfa
diff --git a/genome_graph/data/simple4.gfa b/genome_graph/data/simple4.gfa
diff --git a/genome_graph/tests/__init__.py b/genome_graph/tests/__init__.py
diff --git a/genome_graph/enumerate_paths.py → graph_simplification/enumerate_paths.py b/genome_graph/enumerate_paths.py → graph_simplification/enumerate_paths.py
@@ -1,20 +1,21 @@
 #!/usr/bin/env python3
 
 """
-Enumerate all paths going through the longest node of a gfa graph
--> Enumerates paths in one connected component only for now
+For each connected component of a gfa graph, enumerates all paths going through the longest node of the component
+Merges similar paths using ANI
 """
 
 import argparse
 
-import genome_graph
+
 
 import shutil
 import os
 from csv import reader
 from progress.bar import Bar
+import sys
 
-
+from genome_graph.genome_graph import GenomeGraph
 
 def write2fasta(seq,seqName,fileName):
         ofile = open(os.path.join(fileName), "w")
@@ -33,7 +34,7 @@ def run_pyani(tempDir):
         cov = min(read_pyani_output(os.path.join(tempDir,"ani_out/ANIm_alignment_coverage.tab")))
 
         shutil.rmtree(os.path.join(tempDir,"./ani_out"))
-        
+
         return(identity,cov)
 
 def read_pyani_output(file):
@@ -116,7 +117,7 @@ def compare_paths(paths,outDir):
                                 print("Found a new path with cov ="+str(maxCov)+" and id="+str(maxId))
                                 nb_unique_paths += 1
                                 shutil.move(os.path.join(tmpDir,seqName+".fa"),os.path.join(outDir,seqName+".fa"))
-                
+
                 bar.next()
         bar.finish()
         print("Number of unique Paths : "+str(nb_unique_paths)+"\n")
@@ -135,7 +136,7 @@ def compare_paths(paths,outDir):
 
 # Read graph
 print("Loading graph")
-g = genome_graph.GenomeGraph.read_gfa(opts.infile) 
+g = GenomeGraph.read_gfa(opts.infile)
 
 # List connected components
 comps = g.connected_components()
@@ -154,7 +155,3 @@ def compare_paths(paths,outDir):
     compare_paths(paths,compDir)
 
     compIter += 1
-
-
-
-
diff --git a/genome_graph/filter_components.py → graph_simplification/filter_components.py b/genome_graph/filter_components.py → graph_simplification/filter_components.py
@@ -6,7 +6,8 @@
 
 import argparse
 
-import genome_graph
+from genome_graph import genome_graph
+
 
 op = argparse.ArgumentParser()
 op.add_argument("infile")
@@ -15,7 +16,7 @@
 opts = op.parse_args()
 
 print("Loading graph")
-g = genome_graph.GenomeGraph.read_gfa(opts.infile) 
+g = genome_graph.GenomeGraph.read_gfa(opts.infile)
 
 for comp in g.connected_components():
     #print(comp)
@@ -27,4 +28,3 @@
             g.rem_node(n)
 
 g.write_gfa(opts.outfile)
-
diff --git a/graph_simplification/genome_graph/Lc.schizaphis.gfa b/graph_simplification/genome_graph/Lc.schizaphis.gfa
diff --git a/genome_graph/SequenceAlignment.py → ...ication/genome_graph/SequenceAlignment.py b/genome_graph/SequenceAlignment.py → ...ication/genome_graph/SequenceAlignment.py
diff --git a/graph_simplification/genome_graph/__pycache__/genome_graph.cpython-36.pyc b/graph_simplification/genome_graph/__pycache__/genome_graph.cpython-36.pyc
diff --git a/graph_simplification/genome_graph/__pycache__/paths.cpython-36.pyc b/graph_simplification/genome_graph/__pycache__/paths.cpython-36.pyc
diff --git a/graph_simplification/genome_graph/__pycache__/pyani.cpython-37.pyc b/graph_simplification/genome_graph/__pycache__/pyani.cpython-37.pyc
diff --git a/graph_simplification/genome_graph/__pycache__/utils.cpython-36.pyc b/graph_simplification/genome_graph/__pycache__/utils.cpython-36.pyc
diff --git a/graph_simplification/genome_graph/__pycache__/utils.cpython-37.pyc b/graph_simplification/genome_graph/__pycache__/utils.cpython-37.pyc
diff --git a/genome_graph/alignment.py → ..._simplification/genome_graph/alignment.py b/genome_graph/alignment.py → ..._simplification/genome_graph/alignment.py
diff --git a/genome_graph/genome_graph.py → ...mplification/genome_graph/genome_graph.py b/genome_graph/genome_graph.py → ...mplification/genome_graph/genome_graph.py
@@ -13,9 +13,9 @@
 import re
 import os
 import sys
-from utils import reverse_complement,compare_strings, nw_align, locate_nw_binary
+from .utils import reverse_complement,compare_strings, nw_align, locate_nw_binary
 #from alignment import PairAlign
-from paths import Path,setExtend
+from .paths import Path,setExtend
 
 nwCommand = locate_nw_binary()
 
@@ -369,15 +369,45 @@ def find_all_paths(self,startNode):
               p = Path(self,startNode)
               paths = {p}
               extendedPaths, extended = setExtend(paths,self)
+              #print(extended)
               nbExtension = 1
               while extended:
+                     #print(str(len(extendedPaths))+"\n")
+                     nbExtension += 1
+
+                     paths = extendedPaths.copy()
+
+                     # Removing terminated non circular paths
+                     #print(str(len(paths)))
+                     paths = {p for p in paths if p.extendable == True}
+                     #print(str(len(paths)))
+
+                     # There are smarter things to do : we don't need to try to extend the whole set
+                     extendedPaths, extended = setExtend(paths,self)
+                     #for p in extendedPaths:
+                     #       print(p.nodeIds)
+              return(extendedPaths)
+
+
+       def find_all_cyclic_paths(self,startNode):
+       # Enumerates all possible paths going through a node
+              p = Path(self,startNode)
+              paths = {p}
+              extendedPaths, extended = setExtend(paths,self)
+              nbExtension = 1
+              while extended and len(extendedPaths)<200:
+                     print(str(len(extendedPaths))+"\n")
                      nbExtension += 1
 
                      # There are smarter things to do : we don't need to try to extend the whole set
                      paths = extendedPaths.copy()
                      extendedPaths, extended = setExtend(paths,self)
+                     for p in extendedPaths:
+                            print(p.nodeIds)
               return(extendedPaths)
 
+
+
        def BFS(self, n):
               res = set()
               visited = [False] * (self.maxId+1) # Nodes are 1-indexed