Skip to content

Commit

Permalink
Merge pull request #45 from NBISweden/1.2.8
Browse files Browse the repository at this point in the history
update to version 1.2.7 and 1.2.8
  • Loading branch information
Juke34 authored Oct 8, 2019
2 parents be03cf2 + 865dab2 commit 789f25f
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 19 deletions.
27 changes: 20 additions & 7 deletions EMBLmyGFF3/EMBLmyGFF3.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

shameless_plug="""
#############################################################################
# NBIS 2018 - Sweden #
# NBIS 2019 - Sweden #
# Authors: Martin Norling, Niclas Jareborg, Jacques Dainat #
# Please visit https://github.com/NBISweden/EMBLmyGFF3 for more information #
#############################################################################
Expand Down Expand Up @@ -188,12 +188,12 @@ def _add_mandatory(self):
try:
start = seq.index('n')
while start:
logging.debug("There is gap starting at position %s", start)
logging.debug("There is gap in %s starting at position %s" % (self.record.name,start))
# Now find the end
end = start + 1
while end:
while end < len(seq):
if seq[end] == 'n' :
end +=1
end +=1
else:
break

Expand Down Expand Up @@ -693,7 +693,8 @@ def FT(self):
break
if not locus_tag: #inform the user that we will use the locus_tag instead
msg_type = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist" % (attribute)
msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag)
msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. "\
"Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag)
self.handle_message("warning", msg_type, msg, None)
# create a locus tag based on the prefix + LOCUS + incremented number
if not locus_tag:
Expand Down Expand Up @@ -854,7 +855,10 @@ def set_classification(self, classification = None, strain = None, environmental
if not strain and not environmental_sample and not isolate: #no information provided, let's ask the user
onekey = None
while not onekey:
sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including /environmental_sample must not include the /strain qualifier)\nStrain:")
sys.stderr.write("At least one of the following qualifiers \"strain, environmental_sample, isolate\" must exist " \
"when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing "\
"the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\
"/environmental_sample must not include the /strain qualifier)\nStrain:")
strain = raw_input()
if strain:
EMBL.PREVIOUS_VALUES["strain"]=strain
Expand Down Expand Up @@ -1340,6 +1344,14 @@ def main():
infile.seek(0, 0)

for record in GFF.parse(infile, base_dict=seq_dict):

# Check existence of gff seqid among the fasta sequence identifiers
if record.id not in seq_dict:
logging.warning("Sequence id <%s> from the gff file not found within the fasta file. Are you sure to provide the correct" \
" fasta file? The tool will create a string of ???? as sequence (its length will be the end position of the last feature). " \
"For you information, if you use the --translate option the tool will raise an error due to ??? codons that do not exist." % (record.id))

# Check sequence size and skip if < 100 bp
if len(record.seq)<100:
logging.warning("Sequence %s too short (%s bp)! Minimum accpeted by ENA is 100, we skip it !" % (record.name, len(record.seq) ) )
continue
Expand Down Expand Up @@ -1383,6 +1395,7 @@ def main():
writer.write_all( outfile )

writer = None
EMBL.print_progress(True)
if args.progress:
EMBL.print_progress(True)

sys.stderr.write( """Conversion done\n""")
10 changes: 10 additions & 0 deletions EMBLmyGFF3/modules/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,16 @@ def _infer_ORFs(self, feature):

# basic info
strand = self.location.strand
# Raise an error if the CDS has no strand. Strand is not mandatory (it can be a dot) except for a CDS, where it has an
# impact on the translation and is needed to locate the start and stop codons.
if strand == None:
ID=''
for qualifier in self.feature.qualifiers:
if 'id' == qualifier.lower():
ID = "%s" % " ".join(self.feature.qualifiers[qualifier])
break
logging.error('CDS %s does not have any strand! Please check your gff file.' % ID)
sys.exit()

if start_codon.upper() not in codon_table.start_codons:
self.location = self._set_before(self.location)
Expand Down
33 changes: 23 additions & 10 deletions EMBLmyGFF3/modules/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,36 @@ def __repr__(self):

output = ""
suffix = ""
complement = True
if len(self.location.parts) == len([l for l in self.location.parts if l.strand < 0]):

# Store the distinct strands of the location parts to check for potential inconsistencies
strand=[]
for l in self.location.parts:
if l.strand != None:
if l.strand not in strand:
strand.append(l.strand)

if len(strand) == 0:
logging.debug("No strand stored among the location_parts %s" % self.location.parts)
elif len(strand) > 1:
logging.error("Different strand stored in location_parts (+ strand will be used as default): %s" % self.location.parts)
elif strand == [1]:
logging.debug("+ strand")
else:
logging.debug("- strand")
output += "complement("
suffix += ")"
complement = False

# If there is more than one part, join the different parts together
if (len(self.location.parts) > 1):
output += "join("
suffix += ")"
output += ",".join(self._format_parts(self.location.parts, complement=complement))

output += ",".join(self._format_parts(self.location.parts))

return output + suffix

def _format_parts(self, parts, complement = True):
def _format_parts(self, parts):
output = []
for part in parts:
if part.strand > 0 or complement == False:
output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
else:
output += ["complement(%s..%s)" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
return output
output += ["%s..%s" % (type(part.start)(part.start+1), type(part.end)(part.end+0))]
return output
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

setup(
name='EMBLmyGFF3',
version='1.2.6',
version='1.2.8',

description='An efficient way to convert gff3 annotation files into EMBL format ready to submit',

url='https://github.com/NBISweden/EMBLmyGFF3',
download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.6.tar.gz',
download_url='https://github.com/NBISweden/EMBLmyGFF3/archive/v1.2.8.tar.gz',
author='Martin Norling, Niclas Jareborg, Jacques Dainat',

license='GPL-3.0',
Expand Down

0 comments on commit 789f25f

Please sign in to comment.