Merge pull request #45 from NBISweden/1.2.8

update to version 1.2.7 and 1.2.8
NBISweden · Oct 8, 2019 · 789f25f · 789f25f
2 parents be03cf2 + 865dab2
commit 789f25f
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 19 deletions.
diff --git a/EMBLmyGFF3/EMBLmyGFF3.py b/EMBLmyGFF3/EMBLmyGFF3.py
@@ -9,7 +9,7 @@
 
 shameless_plug="""
     #############################################################################
-    # NBIS 2018 - Sweden                                                        #
+    # NBIS 2019 - Sweden                                                        #
     # Authors: Martin Norling, Niclas Jareborg, Jacques Dainat                  #
     # Please visit https://github.com/NBISweden/EMBLmyGFF3 for more information #
     #############################################################################
@@ -188,12 +188,12 @@ def _add_mandatory(self):
         try:
             start = seq.index('n')
             while start:
-                logging.debug("There is gap starting at position %s", start)
+                logging.debug("There is gap in %s starting at position %s" % (self.record.name,start))
                 # Now find the end
                 end = start + 1
-                while end:
+                while end < len(seq):
                     if seq[end] == 'n' :
-                        end +=1  
+                        end +=1
                     else:
                         break
 
@@ -693,7 +693,8 @@ def FT(self):
                             break
                     if not locus_tag: #inform the user that we will use the locus_tag instead
                         msg_type = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist" % (attribute)
-                        msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag) 
+                        msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. "\
+                        "Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag) 
                         self.handle_message("warning", msg_type, msg, None)
                 # Create a locus tag based on the prefix + LOCUS + an incremented number
                 if not locus_tag:
@@ -854,7 +855,10 @@ def set_classification(self, classification = None, strain = None, environmental
                 if not strain and not environmental_sample and not isolate: #no information provided, let's ask the user
                     onekey = None
                     while not onekey:
-                        sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including /environmental_sample must not include the /strain qualifier)\nStrain:")
+                        sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist " \
+                                         "when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing "\
+                                         "the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\
+                                         "/environmental_sample must not include the /strain qualifier)\nStrain:")
                         strain = raw_input()
                         if strain: 
                             EMBL.PREVIOUS_VALUES["strain"]=strain
@@ -1340,6 +1344,14 @@ def main():
         infile.seek(0, 0)
 
     for record in GFF.parse(infile, base_dict=seq_dict):
+
+        # Check existence of gff seqid among the fasta sequence identifiers
+        if record.id not in seq_dict:
+            logging.warning("Sequence id <%s> from the gff file not found within the fasta file. Are you sure to provide the correct" \
+                            " fasta file? The tool will create a string of ???? as sequence (its length will be the end position of the last feature). " \
+                            "For you information, if you use the --translate option the tool will raise an error due to ??? codons that do not exist." % (record.id))
+
+        # Check sequence size and skip if < 100 bp
         if len(record.seq)<100:
             logging.warning("Sequence %s too short (%s bp)! Minimum accpeted by ENA is 100, we skip it !" % (record.name, len(record.seq) ) )
             continue
@@ -1383,6 +1395,7 @@ def main():
         writer.write_all( outfile )
 
         writer = None
-    EMBL.print_progress(True)
+    if args.progress:    
+        EMBL.print_progress(True)
 
     sys.stderr.write( """Conversion done\n""")
diff --git a/EMBLmyGFF3/modules/feature.py b/EMBLmyGFF3/modules/feature.py
@@ -251,6 +251,16 @@ def _infer_ORFs(self, feature):
 
             # basic info
             strand = self.location.strand
+            # Report an error and exit if the CDS has no strand. The strand is otherwise optional (it can be a dot),
+            # but for a CDS it affects the translation and determines where the start and stop codons are.
+            if strand == None:
+                ID=''
+                for qualifier in self.feature.qualifiers:
+                    if 'id' == qualifier.lower():
+                        ID =  "%s" % " ".join(self.feature.qualifiers[qualifier])
+                        break
+                logging.error('CDS %s does not have any strand! Please check your gff file.'  %  ID) 
+                sys.exit()
 
             if start_codon.upper() not in codon_table.start_codons:
                 self.location = self._set_before(self.location)

diff --git a/EMBLmyGFF3/modules/location.py b/EMBLmyGFF3/modules/location.py
@@ -19,23 +19,36 @@ def __repr__(self):
 
         output = ""
         suffix = ""
-        complement = True
-        if len(self.location.parts) == len([l for l in self.location.parts if l.strand < 0]):
+
+        # Collect the distinct strands of the location parts to detect potential inconsistencies
+        strand=[]
+        for l in self.location.parts:
+            if l.strand != None:
+                if l.strand not in strand:
+                    strand.append(l.strand)
+
+        if len(strand) == 0:
+            logging.debug("No strand stored among the location_parts %s" % self.location.parts)       
+        elif len(strand) > 1:
+            logging.error("Different strand stored in location_parts (+ strand will be used as default): %s" % self.location.parts)
+        elif strand == [1]:
+            logging.debug("+ strand")
+        else:
+            logging.debug("- strand")
             output += "complement("
             suffix += ")"
-            complement = False
+
+        # If there is more than one part, join the different parts together
         if (len(self.location.parts) > 1):
             output += "join("
             suffix += ")"
-        output += ",".join(self._format_parts(self.location.parts, complement=complement))
+
+        output += ",".join(self._format_parts(self.location.parts))
 
         return output + suffix
 
-    def _format_parts(self, parts, complement = True):
+    def _format_parts(self, parts):
         output = []
         for part in parts:
-            if part.strand > 0 or complement == False:
-                output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
-            else:
-                output += ["complement(%s..%s)" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
-        return output
+            output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
+        return output
diff --git a/setup.py b/setup.py
@@ -4,12 +4,12 @@
 
 setup(
     name='EMBLmyGFF3',
-    version='1.2.6',
+    version='1.2.8',
 
     description='An efficient way to convert gff3 annotation files into EMBL format ready to submit',
 
     url='https://github.com/NBISweden/EMBLmyGFF3',
-    download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.6.tar.gz',
+    download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.8.tar.gz',
     author='Martin Norling, Niclas Jareborg, Jacques Dainat',
 
     license='GPL-3.0',