"""
===============
seqnumbering
===============
Deals with sequence numbering conversions.
"""
import collections
import tempfile
import re
import Bio.SeqIO
import Bio.SeqUtils
from dms_tools2 import NTS
[docs]class TranscriptConverter:
"""Convert sites in mRNA transcript to chromosome or CDS sites.
In mRNA sequencing, we identify mutations with respect
to their position in the mRNA. But we may want map them
to the corresponding numbers in the entire chromosome
(gene segment in case of a segmented virus, or entire
viral genome in the case of a non-segmented virus), or
to CDSs encoded on that chromosome.
For all site numbers used as input and output for this
class, numbering is assumed to be 1, 2, .... Note that
this is different than the 0, 1, ... numbering used
by Python for strings.
Args:
`genbankfile` (str)
Genbank file with one or more loci, each of which
should be a separate chromosome. The features
of relevance are called `mRNA` and `CDS`.
Each of these features should have a qualifier
called `label` that gives its name.
`ignore_other_features` (bool)
If `genbankfile` contains features not `mRNA`
or `CDS`, ignore them or raise error?
`to_upper` (bool)
Convert all sequences to upper case letters?
Attributes:
`chromosomes` (dict)
Keyed by chromosome names, values are
`Bio.SeqRecord.SeqRecord` for chromosomes.
`mRNAs` (dict)
Keyed by mRNA names, values are
`Bio.SeqFeature.SeqFeature` for mRNA.
`CDSs` (dict)
Keyed by CDS names, values are
`Bio.SeqFeature.SeqFeature` for CDS.
`mRNA_chromosome` (dict)
Keyed by mRNA name, values is name of chromosome
containing mRNA.
`chromosome_CDSs` (dict)
Keyed by chromosome names, values are names of
CDSs on chromosome.
Specify example `genbankfile` contents with required information.
In this file, the chromosome is `fluNS`, and it encodes two mRNAs
and two CDSs (for `fluNS1` and `fluNS2`):
>>> genbank_text = '''
... LOCUS fluNS 890 bp DNA UNK 01-JAN-1980
... FEATURES Location/Qualifiers
... mRNA 2..868
... /label="fluNS1"
... mRNA join(2..56,529..868)
... /label="fluNS2"
... CDS 27..719
... /label="fluNS1"
... CDS join(27..56,529..864)
... /label="fluNS2"
... ORIGIN
... 1 agcaaaagca gggtgacaaa gacataatgg atccaaacac tgtgtcaagc tttcaggtag
... 61 attgctttct ttggcatgtc cgcaaaagag ttgcagacca agaactaggt gatgccccat
... 121 tccttgatcg gcttcgccga gatcagaagt ccctaagagg aagaggcagc actcttggtc
... 181 tggacatcga aacagccacc cgtgctggaa agcaaatagt ggagcggatt ctgaaggaag
... 241 aatctgatga ggcactcaaa atgaccatgg cctctgtacc tgcatcgcgc tacctaactg
... 301 acatgactct tgaggaaatg tcaaggcact ggttcatgct catgcccaag cagaaagtgg
... 361 caggccctct ttgtatcaga atggaccagg cgatcatgga taagaacatc atactgaaag
... 421 cgaacttcag tgtgattttt gaccggctgg agactctaat attactaagg gccttcaccg
... 481 aagaggggac aattgttggc gaaatttcac cactgccctc tcttccagga catactgatg
... 541 aggatgtcaa aaatgcagtt ggggtcctca tcggaggact tgaatggaat aataacacag
... 601 ttcgagtctc tgaaactcta cagagattcg cttggagaag cagtaatgag aatgggagac
... 661 ctccactcac tccaaaacag aaacggaaaa tggcgggaac aattaggtca gaagtttgaa
... 721 gaaataaggt ggttgattga agaagtgaga cacagactga agataacaga gaatagtttt
... 781 gagcaaataa catttatgca agccttacaa ctattgcttg aagtggagca agagataaga
... 841 actttctcgt ttcagcttat ttaataataa aaaacaccct tgtttctact
... //
... '''
Now initialize a :class:`TranscriptConverter`:
>>> with tempfile.NamedTemporaryFile(mode='r+') as genbankfile:
... _ = genbankfile.write(genbank_text)
... genbankfile.flush()
... _ = genbankfile.seek(0)
... converter = TranscriptConverter(genbankfile)
Confirm resulting :class:`TranscriptConverter` contains one
chromosome (`fluNS`) with the expected to mRNAs and CDSs:
>>> list(converter.chromosomes.keys())
['fluNS']
>>> sorted(converter.CDSs.keys())
['fluNS1', 'fluNS2']
>>> sorted(converter.mRNAs.keys())
['fluNS1', 'fluNS2']
>>> converter.mRNA_chromosome['fluNS1']
'fluNS'
>>> converter.mRNA_chromosome['fluNS2']
'fluNS'
>>> converter.chromosome_CDSs['fluNS']
['fluNS1', 'fluNS2']
Get site in chromosome (`fluNS`) that corresponds to a
position in the `fluNS1` mRNA, and then do the same for
the `fluNS2` mRNA, using
:class:`TranscriptConverter.i_mRNAtoChromosome`. Then
check nucleotide identities with
:class:`TranscriptConverter.ntIdentity`:
>>> converter.i_mRNAtoChromosome('fluNS1', 60)
61
>>> converter.ntIdentity('fluNS', 61)
'A'
>>> converter.i_mRNAtoChromosome('fluNS2', 58)
531
>>> converter.ntIdentity('fluNS', 531)
'C'
Do the same for substrings of `fluNS1` and `fluNS2`:
>>> converter.i_mRNAtoChromosome('fluNS1', 2, mRNAfragment='GATTGCTTTCT')
61
>>> converter.i_mRNAtoChromosome('fluNS2', 9, mRNAfragment='TTTCAGGACATACTGATG')
531
Get amino-acid substitutions caused by chromosome point mutation.
We can specify point mutation as string or tuple, and get
output with 3 or 1-letter amino-acid codes:
>>> converter.aaSubstitutions('fluNS', 'A61T')
'fluNS1-Asp12Val'
>>> converter.aaSubstitutions('fluNS', ('A', 61, 'T'))
'fluNS1-Asp12Val'
>>> converter.aaSubstitutions('fluNS', 'A61T', aa_3letter=False)
'fluNS1-D12V'
Now look at a point mutation that affects `fluNS1` and `fluNS2`, but
only causes an amino-acid substitution in the former (is synonymous
in the latter):
>>> converter.aaSubstitutions('fluNS', 'C531T')
'fluNS1-His169Tyr'
Now mutation that causes amino-acid substitutions in
`fluNS1` and `fluNS2`:
>>> converter.aaSubstitutions('fluNS', 'A532T')
'fluNS1-His169Leu_fluNS2-Ile12Leu'
"""
def __init__(self, genbankfile, *,
ignore_other_features=False, to_upper=True):
"""See main class docstring."""
self.to_upper = to_upper
self.chromosomes = {c.name:c for c in
Bio.SeqIO.parse(genbankfile, 'genbank')}
self.CDSs = {}
self.mRNAs = {}
self.mRNA_chromosome = {}
self.chromosome_CDSs = collections.defaultdict(list)
for c in self.chromosomes.values():
if to_upper:
c.seq = c.seq.upper()
for feature in c.features:
if feature.type in {'mRNA', 'CDS'}:
name = feature.qualifiers['label']
if len(name) != 1:
raise ValueError("multiple 'label' qualifiers")
feature.name = name[0]
if feature.type == 'mRNA':
d = self.mRNAs
self.mRNA_chromosome[feature.name] = c.name
elif feature.type == 'CDS':
d = self.CDSs
self.chromosome_CDSs[c.name].append(feature.name)
else:
raise RuntimeError('should not get here')
if feature.name in d:
raise ValueError("duplicate {0} {1}".format(
feature.type, feature.name))
d[feature.name] = feature
elif not ignore_other_features:
raise ValueError("unrecognized feature type {0}"
.format(feature.type))
[docs] def i_mRNAtoChromosome(self, mRNA, i, *, mRNAfragment=None):
"""Convert site number in mRNA to number in chromosome.
Args:
`mRNA` (str)
Name of a valid mRNA.
`i` (int)
Site number in mRNA.
`mRNAfragment` (str or `None`)
Substring sequence of `mRNA`. In this case, `i`
is taken as the site in this substring of the
`mRNA`. Useful because sometimes mutations may
be called in a fragment of the full mRNA. Set
to `None` if `i` is site in full mRNA.
Returns:
Site number in chromosome that contains `mRNA`.
"""
try:
chromosome = self.chromosomes[self.mRNA_chromosome[mRNA]]
mRNA_feature = self.mRNAs[mRNA]
except KeyError:
raise ValueError("invalid `mRNA` {0}".format(mRNA))
mRNA_seq = mRNA_feature.extract(chromosome).seq
if mRNAfragment:
if self.to_upper:
mRNAfragment = mRNAfragment.upper()
n = mRNA_seq.count(mRNAfragment)
if n == 1:
i += mRNA_seq.find(mRNAfragment)
elif n > 1:
raise ValueError("`mRNA` {0} does contains {1} "
"copies of `mRNAfragment`".format(mRNA, n))
else:
raise ValueError("`mRNA` {0} does not contain "
"specified `mRNAfragment`".format(mRNA))
mRNA_sites = list(mRNA_feature)
if not (1 <= i <= len(mRNA_sites)):
raise ValueError("`i` of {0} not valid site in `mRNA` {1}"
.format(i, mRNA))
return mRNA_sites[i - 1] + 1
[docs] def ntIdentity(self, chromosome, i):
"""Gets identity at site in chromosome.
Args:
`chromosome` (str)
Name of chromosome.
`i` (int)
Site in chromosome.
Returns:
Nucleotide in `chromosome` at site `i`.
"""
try:
chromosome_seq = self.chromosomes[chromosome]
except KeyError:
raise ValueError("`chromosome` {0} does not exist"
.format(chromosome))
if not (1 <= i <= len(chromosome_seq)):
raise ValueError("`i` of {0} not in `chromosome` {1}"
.format(i, chromosome))
return str(chromosome_seq[i - 1])
[docs] def aaSubstitutions(self, chromosome, mutation, *,
aa_3letter=True):
"""Amino-acid substitutions from point mutation to chromosome.
Gets all amino-acid substitutions in CDSs caused by a
point mutation to a chromosome.
Args:
`chromosome` (str)
Name of chromosome.
`mutation` (str or 3-tuple)
Point mutation in chromosome based numbering. A str
like "A50T", or tuple `(wt, i, mut)` where `wt` is
wildtype nt, `i` is site, and `mut` is mutant nt.
`aa_3letter` (bool)
Use 3-letter rather than 1-letter amino-acid codes?
Returns:
A string giving the amino-acid mutations:
- If no mutations, returns empty str.
- If one mutation, will be str like 'fluNS1-Asp12Val'.
- If several mutations, will be str like
'fluNS1-His169Leu_fluNS2-Ile12Leu'.
"""
try:
cds_names = self.chromosome_CDSs[chromosome]
chromosome_seq = self.chromosomes[chromosome].seq
except KeyError:
raise ValueError("invalid `chromosome` {0}".format(chromosome))
if isinstance(mutation, str):
m = re.match(r'^(?P<wt>[{0}])(?P<i>\d+)(?P<mut>[{0}])$'
.format(''.join(NTS)), mutation)
if not m:
raise ValueError("cannot parse mutation {0}"
.format(mutation))
mutation = (m.group('wt'), int(m.group('i')), m.group('mut'))
if (len(mutation) != 3) or (mutation[0] not in NTS
) or (mutation[2] not in NTS):
raise ValueError("invalid mutation {0}".format(mutation))
(wt, i, mut) = mutation
if not (1 <= i <= len(chromosome_seq)):
raise ValueError("`i` of {0} is not valid site in "
"`chromosome` {1}".format(i, chromosome))
if wt != self.ntIdentity(chromosome, i):
raise ValueError("Invalid wildtype nucleotide at site "
"{0} of {1}. Specified {2}, should be {3}".format(
i, chromosome, i, self.ntIdentity(chromosome, i)))
if wt == mut:
raise ValueError("wildtype and mutant nt are the same")
cds_w_mut = [cds for cds in [self.CDSs[cds_name]
for cds_name in cds_names] if (i - 1) in cds]
mutstring = []
mut_chromosome = None
for cds_name in cds_names:
cds = self.CDSs[cds_name]
if (i - 1) in cds:
if mut_chromosome is None:
mut_chromosome = chromosome_seq.tomutable()
mut_chromosome[i - 1] = mut
wtprot = cds.extract(chromosome_seq).translate()
mutprot = cds.extract(mut_chromosome).translate()
aa_mut = [(wt, r0 + 1, mut) for r0, (wt, mut) in
enumerate(zip(wtprot, mutprot)) if wt != mut]
if len(aa_mut) > 1:
raise ValueError(">1 amino-acid mutation")
elif aa_mut:
wt_aa, r, mut_aa = aa_mut[0]
if aa_3letter:
wt_aa = Bio.SeqUtils.seq3(wt_aa)
mut_aa = Bio.SeqUtils.seq3(mut_aa)
mutstring.append('{0}-{1}{2}{3}'.format(cds_name,
wt_aa, r, mut_aa))
return '_'.join(mutstring)
if __name__ == '__main__':
import doctest
doctest.testmod()