Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 47 additions & 7 deletions funannotate/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -2892,6 +2892,8 @@ def tbl2dict(input, fasta, Genes):
# now we need to sort coordinates, get protein/transcript sequences and capture UTRs
SeqRecords = SeqIO.to_dict(SeqIO.parse(fasta, 'fasta'))
for k, v in list(Genes.items()):
# @nextgenusfs we should clarify or rename this variable to indicate
# i is the i-th transcript, right??
for i in range(0, len(v['ids'])):
if v['type'] in ['mRNA', 'tRNA', 'ncRNA']:
if v['strand'] == '+':
Expand All @@ -2908,12 +2910,28 @@ def tbl2dict(input, fasta, Genes):
else:
sortedCDS = sorted(v['CDS'][i], key=lambda tup: tup[0],
reverse=True)
# get the codon_start by getting first CDS phase + 1

cdsSeq = getSeqRegions(SeqRecords, v['contig'], sortedCDS)
# If the translation starts in middle of a codon,
# we need to truncate the CDS seq either at start or end
# depending on strand.
if v['codon_start'][i] > 1:
if v['strand'] == "+":
# drop first N bases based on codon_start
# to reflect the translation frame
cdsSeq = cdsSeq[v['codon_start'][i]-1:]
elif v['strand'] == "-":
# drop last N bases based on codon_start
# to reflect the translation frame (because this is
# is reverse strand gene)
endTrunc = len(cdsSeq) - (v['codon_start'][i] - 1)
cdsSeq = cdsSeq[0:endTrunc]
else:
# could trigger more of a warning/error
print("ERROR strand (%s) is nonsensical for %s"%(v['strand'],k))
Genes[k]['cds_transcript'].append(cdsSeq)
Genes[k]['CDS'][i] = sortedCDS
protSeq, codon_start = (None,)*2
protSeq = translate(cdsSeq, v['strand'], v['codon_start'][i]-1)
protSeq = translate(cdsSeq, v['strand'],0)
if protSeq:
Genes[k]['protein'].append(protSeq)
if protSeq.endswith('*'):
Expand Down Expand Up @@ -4501,8 +4519,7 @@ def gff2dict(file, fasta, Genes, debug=False, gap_filter=False):
cdsSeq = getSeqRegions(SeqRecords, v['contig'], sortedCDS)
if gap_filter:
cdsSeq, v['CDS'][i] = start_end_gap(cdsSeq, v['CDS'][i])
Genes[k]['cds_transcript'].append(cdsSeq)
Genes[k]['CDS'][i] = sortedCDS

protSeq, codon_start = (None,)*2
try:
currentphase = v['phase'][i]
Expand Down Expand Up @@ -4530,6 +4547,16 @@ def gff2dict(file, fasta, Genes, debug=False, gap_filter=False):
#translate and get protein sequence
protSeq = translate(cdsSeq, v['strand'], codon_start-1)
Genes[k]['codon_start'][i] = codon_start
if codon_start > 1:
if v['strand'] == '+':
cdsSeq = cdsSeq[codon_start - 1:]
elif v['strand'] == '-':
endTrunc = len(cdsSeq) - codon_start -1
cdsSeq = cdsSeq[0:endTrunc]
else:
print("ERROR nonsensical strand (%s) for gene %s"%([v['strand'],k]))
Genes[k]['cds_transcript'].append(cdsSeq)
Genes[k]['CDS'][i] = sortedCDS
v['protein'].append(protSeq)
if protSeq:
if protSeq.endswith('*'):
Expand Down Expand Up @@ -6273,9 +6300,12 @@ def zff2gff3(input, fasta, output):
indexStart = [x for x, y in enumerate(
v['CDS'][i]) if y[0] == sortedCDS[0][0]]
cdsSeq = getSeqRegions(SeqRecords, v['contig'], sortedCDS)
Genes[k]['cds_transcript'].append(cdsSeq)
# this seems to be missing removal of codon_start overhang?

Genes[k]['CDS'][i] = sortedCDS
protSeq, codon_start = (None,)*2

protSeq,codon_start = (None,)*2

if '?' in v['phase'][i]: # dont know the phase -- malformed GFF3, try to find best CDS
translateResults = []
for y in [1, 2, 3]:
Expand All @@ -6290,8 +6320,18 @@ def zff2gff3(input, fasta, output):
else:
codon_start = int(v['phase'][i][indexStart[0]]) + 1
# translate and get protein sequence
cdsSeq = cdsSeq[codon_start-1:]
protSeq = translate(cdsSeq, v['strand'], codon_start-1)
Genes[k]['codon_start'][i] = codon_start
if codon_start > 1:
if v['strand'] == '+':
cdsSeq = cdsSeq[codon_start - 1:]
elif v['strand'] == '-':
endTrunc = len(cdsSeq) - codon_start -1
cdsSeq = cdsSeq[0:endTrunc]
else:
print("ERROR nonsensical strand (%s) for gene %s"%([v['strand'],k]))
Genes[k]['cds_transcript'].append(cdsSeq)
if protSeq:
Genes[k]['protein'].append(protSeq)
if protSeq.endswith('*'):
Expand Down