diff --git a/EMBLmyGFF3/EMBLmyGFF3.py b/EMBLmyGFF3/EMBLmyGFF3.py index 8bb0dc1..18e5870 100755 --- a/EMBLmyGFF3/EMBLmyGFF3.py +++ b/EMBLmyGFF3/EMBLmyGFF3.py @@ -9,7 +9,7 @@ shameless_plug=""" ############################################################################# - # NBIS 2018 - Sweden # + # NBIS 2019 - Sweden # # Authors: Martin Norling, Niclas Jareborg, Jacques Dainat # # Please visit https://github.com/NBISweden/EMBLmyGFF3 for more information # ############################################################################# @@ -188,12 +188,12 @@ def _add_mandatory(self): try: start = seq.index('n') while start: - logging.debug("There is gap starting at position %s", start) + logging.debug("There is gap in %s starting at position %s" % (self.record.name,start)) # Now find the end end = start + 1 - while end: + while end < len(seq): if seq[end] == 'n' : - end +=1 + end +=1 else: break @@ -693,7 +693,8 @@ def FT(self): break if not locus_tag: #inform the user that we will use the locus_tag instead msg_type = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist" % (attribute) - msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag) + msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. "\ + "Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag) self.handle_message("warning", msg_type, msg, None) # create a locus tag base on the prefix + LOCUS + incremented number if not locus_tag: @@ -854,7 +855,10 @@ def set_classification(self, classification = None, strain = None, environmental if not strain and not environmental_sample and not isolate: #no information provided, let's ask the user onekey = None while not onekey: - sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including /environmental_sample must not include the /strain qualifier)\nStrain:") + sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist " \ + "when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing "\ + "the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\ + "/environmental_sample must not include the /strain qualifier)\nStrain:") strain = raw_input() if strain: EMBL.PREVIOUS_VALUES["strain"]=strain @@ -1340,6 +1344,14 @@ def main(): infile.seek(0, 0) for record in GFF.parse(infile, base_dict=seq_dict): + + # Check existence of gff seqid among the fasta sequence identifiers + if record.id not in seq_dict: + logging.warning("Sequence id <%s> from the gff file not found within the fasta file. Are you sure to provide the correct" \ + " fasta file? The tool will create a string of ???? as sequence (its length will be the end position of the last feature). " \ + "For you information, if you use the --translate option the tool will raise an error due to ??? codons that do not exist." % (record.id)) + + # Check sequence size and skip if < 100 bp if len(record.seq)<100: logging.warning("Sequence %s too short (%s bp)! Minimum accpeted by ENA is 100, we skip it !" % (record.name, len(record.seq) ) ) continue @@ -1383,6 +1395,7 @@ def main(): writer.write_all( outfile ) writer = None - EMBL.print_progress(True) + if args.progress: + EMBL.print_progress(True) sys.stderr.write( """Conversion done\n""") diff --git a/EMBLmyGFF3/modules/feature.py b/EMBLmyGFF3/modules/feature.py index f4f9f2e..7e5eab4 100755 --- a/EMBLmyGFF3/modules/feature.py +++ b/EMBLmyGFF3/modules/feature.py @@ -251,6 +251,16 @@ def _infer_ORFs(self, feature): # basic info strand = self.location.strand + # raise an error if no strand for the CDS. Strand is not mandatory (can be a dot) except for CDS where it has an + # impact on the translation, and to check where is start and stop codon... + if strand == None: + ID='' + for qualifier in self.feature.qualifiers: + if 'id' == qualifier.lower(): + ID = "%s" % " ".join(self.feature.qualifiers[qualifier]) + break + logging.error('CDS %s does not have any strand! Please check your gff file.' % ID) + sys.exit() if start_codon.upper() not in codon_table.start_codons: self.location = self._set_before(self.location) diff --git a/EMBLmyGFF3/modules/location.py b/EMBLmyGFF3/modules/location.py index a3f63da..6e3dce0 100755 --- a/EMBLmyGFF3/modules/location.py +++ b/EMBLmyGFF3/modules/location.py @@ -19,23 +19,36 @@ def __repr__(self): output = "" suffix = "" - complement = True - if len(self.location.parts) == len([l for l in self.location.parts if l.strand < 0]): + + # Store all strand from the location parts to check potential inconsistency + strand=[] + for l in self.location.parts: + if l.strand != None: + if l.strand not in strand: + strand.append(l.strand) + + if len(strand) == 0: + logging.debug("No strand stored among the location_parts %s" % self.location.parts) + elif len(strand) > 1: + logging.error("Different strand stored in location_parts (+ strand will be used as default): %s" % self.location.parts) + elif strand == [1]: + logging.debug("+ strand") + else: + logging.debug("- strand") output += "complement(" suffix += ")" - complement = False + + # If more than one part let's join the differnt parts together if (len(self.location.parts) > 1): output += "join(" suffix += ")" - output += ",".join(self._format_parts(self.location.parts, complement=complement)) + + output += ",".join(self._format_parts(self.location.parts)) return output + suffix - def _format_parts(self, parts, complement = True): + def _format_parts(self, parts): output = [] for part in parts: - if part.strand > 0 or complement == False: - output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))] - else: - output += ["complement(%s..%s)" % (type(part.start)(part.start+1), type(part.end)(part.end+0))] - return output + output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))] + return output \ No newline at end of file diff --git a/setup.py b/setup.py index 29f93cc..217798b 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,12 @@ setup( name='EMBLmyGFF3', - version='1.2.6', + version='1.2.8', description='An efficient way to convert gff3 annotation files into EMBL format ready to submit', url='https://github.com/NBISweden/EMBLmyGFF3', - download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.6.tar.gz', + download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.8.tar.gz', author='Martin Norling, Niclas Jareborg, Jacques Dainat', license='GPL-3.0',