"""
===================
utils
===================
Miscellaneous utilities for ``dms_tools2``.
"""
import os
import math
import sys
import time
import platform
import importlib
import logging
import tempfile
import textwrap
import itertools
import collections
import random
import re
import pysam
import numpy
import scipy.misc
import scipy.special
import pandas
import gzip
import dms_variants.codonvarianttable
import dms_tools2
from dms_tools2 import CODONS, CODON_TO_AA, AAS_WITHSTOP, AA_TO_CODONS, NTS
import dms_tools2._cutils
[docs]def sessionInfo():
"""Returns string with information about session / packages."""
s = [
'Version information:',
'\tTime and date: {0}'.format(time.asctime()),
'\tPlatform: {0}'.format(platform.platform()),
'\tPython version: {0}'.format(
sys.version.replace('\n', ' ')),
'\tdms_tools2 version: {0}'.format(dms_tools2.__version__),
]
for modname in ['Bio', 'pandas', 'numpy', 'IPython',
'jupyter', 'matplotlib', 'plotnine', 'natsort', 'pystan',
'scipy', 'seaborn', 'phydmslib', 'statsmodels', 'rpy2',
'regex', 'umi_tools']:
try:
v = importlib.import_module(modname).__version__
s.append('\t{0} version: {1}'.format(modname, v))
except AttributeError:
s.append('\t{0} version unknown'.format(modname))
except ImportError:
s.append("\t{0} cannot be imported".format(modname))
return '\n'.join(s)
[docs]def initLogger(logfile, prog, args):
"""Initialize output logging for scripts.
Args:
`logfile` (str or `sys.stdout`)
Name of file to which log is written, or
`sys.stdout` if you just want to write information
to standard output.
`prog` (str)
Name of program for which we are logging.
`args` (dict)
Program arguments as arg / value pairs.
Returns:
If `logfile` is a string giving a file name, returns
an opened and initialized `logging.Logger`. If `logfile`
is `sys.stdout`, then writes information to `sys.stdout`.
In either case, basic information is written about the program
and args.
"""
if logfile == sys.stdout:
logfile.write("Beginning execution of {0} in directory {1}\n\n".format(
prog, os.getcwd()))
logfile.write("{0}\n\n".format(sessionInfo()))
logfile.write("Parsed the following arguments:\n\t{0}\n\n".format(
'\n\t'.join(['{0} = {1}'.format(arg, val) for (arg, val)
in args.items()])))
else:
if os.path.isfile(logfile):
os.remove(logfile)
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(prog)
logfile_handler = logging.FileHandler(logfile)
logger.addHandler(logfile_handler)
formatter = logging.Formatter(
'%(asctime)s - %(levelname)s - %(message)s')
logfile_handler.setFormatter(formatter)
try:
logger.info("Beginning execution of {0} in directory {1}\n"
.format(prog, os.getcwd()))
logger.info("Progress is being logged to {0}".format(logfile))
logger.info("{0}\n".format(sessionInfo()))
logger.info("Parsed the following arguments:\n\t{0}\n".format(
'\n\t'.join(['{0} = {1}'.format(arg, val) for (arg, val)
in args.items()])))
except:
logger.exception("Error")
raise
return logger
[docs]def iteratePairedFASTQ(r1files, r2files, r1trim=None, r2trim=None):
"""Iterates over FASTQ files for single or paired-end sequencing.
Args:
`r1files` (list or str)
Name of R1 FASTQ file or list of such files. Can optionally
be gzipped.
`r2files` (list or str or `None`)
Like `r1files` but for R2 files, or `None` if no R2.
`r1trim` (int or `None`)
If not `None`, trim `r1` and `q1` to be no longer than this.
`r2trim` (int or `None`)
Like `r1trim` but for R2.
Returns:
Each iteration returns `(name, r1, r2, q1, q2, fail)` where:
- `name` is a string giving the read name
- `r1` and `r2` are strings giving the reads; `r2`
is `None` if no R2.
- `q1` and `q2` are strings giving the PHRED Q scores;
`q2` is none if no R2.
- `fail` is `True` if either read failed Illumina chastity
filter, `False` if both passed, `None` if info not present.
We run a simple test by first writing an example FASTQ file and
then testing on it.
>>> n1_1 = '@DH1DQQN1:933:HMLH5BCXY:1:1101:2165:1984 1:N:0:CGATGT'
>>> r1_1 = 'ATGCAATTG'
>>> q1_1 = 'GGGGGIIII'
>>> n2_1 = '@DH1DQQN1:933:HMLH5BCXY:1:1101:2165:1984 2:N:0:CGATGT'
>>> r2_1 = 'CATGCATA'
>>> q2_1 = 'G<GGGIII'
>>> tf = tempfile.NamedTemporaryFile
>>> with tf(mode='w') as r1file, tf(mode='w') as r2file:
... _ = r1file.write('\\n'.join([
... n1_1, r1_1, '+', q1_1,
... n1_1.replace(':N:', ':Y:'), r1_1, '+', q1_1,
... n1_1.split()[0], r1_1, '+', q1_1,
... ]))
... r1file.flush()
... _ = r2file.write('\\n'.join([
... n2_1, r2_1, '+', q2_1,
... n2_1, r2_1, '+', q2_1,
... n2_1, r2_1, '+', q2_1,
... ]))
... r2file.flush()
... itr = iteratePairedFASTQ(r1file.name, r2file.name, r1trim=4, r2trim=5)
... next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], False)
... next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], True)
... next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], None)
True
True
True
Now do the same test but for just R1:
>>> with tf(mode='w') as r1file:
... _ = r1file.write('\\n'.join([
... n1_1, r1_1, '+', q1_1,
... n1_1.replace(':N:', ':Y:'), r1_1, '+', q1_1,
... n1_1.split()[0], r1_1, '+', q1_1,
... ]))
... r1file.flush()
... itr_R1 = iteratePairedFASTQ(r1file.name, None, r1trim=4)
... next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... None, q1_1[ : 4], None, False)
... next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... None, q1_1[ : 4], None, True)
... next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
... None, q1_1[ : 4], None, None)
True
True
True
"""
if isinstance(r1files, str):
r1files = [r1files]
if r2files is not None:
r2files = [r2files]
if not all(map(os.path.isfile, r1files)):
raise ValueError('cannot find all `r1files`')
if r2files is None:
r2files = [None] * len(r1files)
elif len(r1files) != len(r2files):
raise ValueError('`r1files` and `r2files` differ in length')
elif not all(map(os.path.isfile, r2files)):
raise ValueError('cannot find all `r2files`')
for (r1file, r2file) in zip(r1files, r2files):
r1reader = pysam.FastxFile(r1file)
if r2file is None:
read_iterator = r1reader
else:
r2reader = pysam.FastxFile(r2file)
read_iterator = zip(r1reader, r2reader)
for tup in read_iterator:
if r2file is None:
a1 = tup
r2 = q2 = None
else:
a1, a2 = tup
r2 = a2.sequence
q2 = a2.quality
if a2.comment is not None:
id2 = f"{a2.name} {a2.comment}".split()
else:
id2 = a2.name.split()
name2 = id2[0]
r1 = a1.sequence
q1 = a1.quality
if a1.comment is not None:
id1 = f"{a1.name} {a1.comment}".split()
else:
id1 = a1.name.split()
name1 = id1[0]
if r2file is not None:
# trims last two chars, need for SRA downloaded files
if name1[-2 : ] == '.1' and name2[-2 : ] == '.2':
name1 = name1[ : -2]
name2 = name2[ : -2]
if name1 != name2:
raise ValueError(f"name mismatch {name1} vs {name2}")
# parse chastity filter assuming CASAVA 1.8 header
try:
f1 = id1[1][2]
if r2file is None:
f2 = 'N'
else:
f2 = id2[1][2]
if f1 == 'N' and f2 == 'N':
fail = False
elif f1 in ['N', 'Y'] and f2 in ['N', 'Y']:
fail = True
except IndexError:
fail = None # header does not specify chastity filter
if r1trim is not None:
r1 = r1[ : r1trim]
q1 = q1[ : r1trim]
if (r2trim is not None) and (r2file is not None):
r2 = r2[ : r2trim]
q2 = q2[ : r2trim]
yield (name1, r1, r2, q1, q2, fail)
[docs]def lowQtoN(r, q, minq, use_cutils=True):
"""Replaces low quality nucleotides with ``N`` characters.
Args:
`r` (str)
A string representing a sequencing read.
`q` (str)
String of same length as `r` holding Q scores
in Sanger ASCII encoding.
`minq` (length-one string)
Replace all positions in `r` where `q` is < this.
`use_cutils` (bool)
Use the faster implementation in the `_cutils` module.
Returns:
A version of `r` where all positions `i` where
`q[i] < minq` have been replaced with ``N``.
>>> r = 'ATGCAT'
>>> q = 'GB<.0+'
>>> minq = '0'
>>> lowQtoN(r, q, minq) == 'ATGNAN'
True
"""
if use_cutils:
return dms_tools2._cutils.lowQtoN(r, q, minq)
assert len(r) == len(q)
return ''.join([ri if qi >= minq else 'N'
for (ri, qi) in zip(r, q)])
[docs]def buildReadConsensus(reads, minreads, minconcur, use_cutils=True):
"""Builds consensus sequence of some reads.
You may want to pre-fill low-quality sites with ``N``
using `lowQtoN`. An ``N`` is considered a non-called identity.
Args:
`reads` (list)
List of reads as strings. If reads are not all same
length, shorter ones are extended from 3' end with ``N``
to match maximal length.
`minreads` (int)
Only call consensus at a site if at least this many reads
have called identity.
`minconcur` (float)
Only call consensus at site if >= this fraction of called
identities agree.
`use_cutils` (bool)
Use the faster implementation in the `_cutils` module.
Returns:
A string giving the consensus sequence. Non-called
sites are returned as ``N```.
>>> reads = ['ATGCAT',
... 'NTGNANA',
... 'ACGNNTAT',
... 'NTGNTA']
>>> buildReadConsensus(reads, 2, 0.75) == 'ATGNNNAN'
True
>>> reads.append('CTGCATAT')
>>> buildReadConsensus(reads, 2, 0.75) == 'NTGCATAT'
True
"""
if use_cutils:
return dms_tools2._cutils.buildReadConsensus(reads,
minreads, minconcur)
readlens = list(map(len, reads))
maxlen = max(readlens)
consensus = []
for i in range(maxlen):
counts = {}
for (r, lenr) in zip(reads, readlens):
if lenr > i:
x = r[i]
if x != 'N':
if x in counts:
counts[x] += 1
else:
counts[x] = 1
ntot = sum(counts.values())
if ntot < minreads:
consensus.append('N')
else:
(nmax, xmax) = sorted([(n, x) for (x, n) in counts.items()])[-1]
if nmax / float(ntot) >= minconcur:
consensus.append(xmax)
else:
consensus.append('N')
return ''.join(consensus)
[docs]def rarefactionCurve(barcodes, *, maxpoints=1e5, logspace=True):
"""Rarefaction curve from list of barcodes.
Uses the analytical formula for the rarefaction curve defined
`on Wikipedia <https://en.wikipedia.org/wiki/Rarefaction_(ecology)#Derivation>`_.
Args:
`barcodes` (list or pandas Series)
Holds the list of unique barcodes for which we calculate
the rarefaction curve. It is expected that some of these
barcodes will be repeated multiple times in the list if
the sampling is approaching saturation.
`maxpoints` (int)
Only calculate values at this many points. The benefit
of this is that it can become very costly to calculate
the curve at every point when there are many points.
`logspace` (True)
Logarithmically space the `maxpoints` points for
the calculation. This will give better results if
we are subsampling and the curve saturates. Only
done if we have to subsample.
Returns:
The 2-tuple `(nreads, nbarcodes)`, where both `nreads` and
`nbarcodes` are lists of the same length, and `nbarcodes[i]`
is the expected number of barcodes observed when there are
`nreads[i]` reads.
Here we take a very small list and show that the results given
by the function are equivalent to those obtained by random
subsampling:
>>> barcodes = ['A', 'A', 'A', 'A', 'G', 'G', 'C', 'T']
>>> (nreads, nbarcodes) = rarefactionCurve(barcodes)
>>> random.seed(1)
>>> nrand = 100000
>>> sim_equal_calc = []
>>> for n in range(1, len(barcodes) + 1):
... nbarcodes_sim = sum([len(set(random.sample(barcodes, n)))
... for _ in range(nrand)]) / nrand
... sim_equal_calc.append(numpy.allclose(nbarcodes_sim,
... nbarcodes[nreads.index(n)], atol=1e-2))
>>> all(sim_equal_calc)
True
"""
N = len(barcodes) # total number of items
Ni = collections.Counter(barcodes)
K = len(Ni)
Mj = collections.Counter(Ni.values())
Nk, num = map(numpy.array, zip(*Mj.items()))
# use simplification that (N - Ni)Cr(n) / (N)Cr(n) =
# [(N - Ni)! * (N - n)!] / [N! * (N - Ni - n)!]
#
# Also use fact that gamma(x + 1) = x!
nbarcodes = []
lnFactorial_N = scipy.special.gammaln(N + 1)
if logspace and N > maxpoints:
nreads = list(numpy.unique(numpy.logspace(
math.log10(1), math.log10(N),
num=int(min(N, maxpoints))).astype('int')))
else:
nreads = list(numpy.unique(numpy.linspace(
1, N, num=min(N, maxpoints)).astype('int')))
for n in nreads:
lnFactorial_N_minus_n = scipy.special.gammaln(N - n + 1)
i = numpy.nonzero(N - Nk - n >= 0) # indices where this is true
nbarcodes.append(
K - (num[i] * numpy.exp(
scipy.special.gammaln(N - Nk[i] + 1) +
lnFactorial_N_minus_n -
lnFactorial_N -
scipy.special.gammaln(N - Nk[i] - n + 1))
).sum()
)
return (nreads, nbarcodes)
[docs]def reverseComplement(s, use_cutils=True):
"""Gets reverse complement of DNA sequence `s`.
Args:
`s` (str)
Sequence to reverse complement.
`use_cutils` (bool)
Use the faster implementation in the `_cutils` module.
Returns:
Reverse complement of `s` as a str.
>>> s = 'ATGCAAN'
>>> reverseComplement(s) == 'NTTGCAT'
True
"""
if use_cutils:
return dms_tools2._cutils.reverseComplement(s)
return ''.join(reversed([dms_tools2.NTCOMPLEMENT[nt] for nt in s]))
[docs]def alignSubamplicon(refseq, r1, r2, refseqstart, refseqend, maxmuts,
maxN, chartype, use_cutils=True):
"""Try to align subamplicon to reference sequence at defined location.
Tries to align reads `r1` and `r2` to `refseq` at location
specified by `refseqstart` and `refseqend`. Determines how many
sites of type `chartype` have mutations, and if <= `maxmuts` conside
the subamplicon to align if fraction of ambiguous nucleotides <= `maxN`.
In `r1` and `r2`, an ``N`` indicates a non-called ambiguous identity.
If the reads disagree in a region of overlap that is set to ``N`` in
the final subamplicon, but if one read has ``N`` and the other a called
identity, then the called identity is used in the final subamplicon.
Args:
`refseq` (str)
Sequence to which we align. if `chartype` is 'codon',
must be a valid coding (length multiple of 3).
`r1` (str)
The forward sequence to align.
`r2` (str)
The reverse sequence to align. When reverse complemented,
should read backwards in `refseq`.
`refseqstart` (int)
The nucleotide in `refseq` (1, 2, ... numbering) where the
first nucleotide in `r1` aligns.
`refseqend` (int)
The nucleotide in `refseq` (1, 2, ... numbering) where the
first nucleotide in `r2` aligns (note that `r2` then reads
backwards towards the 5' end of `refseq`).
`maxmuts` (int or float)
Maximum number of mutations of character `chartype` that
are allowed in the aligned subamplicons from the two reads.
`maxN` (int or float)
Maximum number of nucleotides for which we allow
ambiguous (``N``) identities in final subamplicon.
`chartype` (str)
Character type for which we count mutations.
Currently, the only allowable value is 'codon'.
`use_cutils` (bool)
Use the faster implementation in the `_cutils` module.
Returns:
If reads align, return aligned subamplicon as string (of length
`refseqend - refseqstart + 1`). Otherwise return `None`.
>>> refseq = 'ATGGGGAAA'
>>> s = alignSubamplicon(refseq, 'GGGGAA', 'TTTCCC', 3, 9, 1, 1, 'codon')
>>> s == 'GGGGAAA'
True
>>> s = alignSubamplicon(refseq, 'GGGGAA', 'TTTCCC', 1, 9, 1, 1, 'codon')
>>> s == False
True
>>> s = alignSubamplicon(refseq, 'GGGGAT', 'TTTCCC', 3, 9, 1, 0, 'codon')
>>> s == False
True
>>> s = alignSubamplicon(refseq, 'GGGGAT', 'TTTCCC', 3, 9, 1, 1, 'codon')
>>> s == 'GGGGANA'
True
>>> s = alignSubamplicon(refseq, 'GGGGAT', 'TATCCC', 3, 9, 1, 0, 'codon')
>>> s == 'GGGGATA'
True
>>> s = alignSubamplicon(refseq, 'GGGGAT', 'TATCCC', 3, 9, 0, 0, 'codon')
>>> s == False
True
>>> s = alignSubamplicon(refseq, 'GGGNAA', 'TTTCCC', 3, 9, 0, 0, 'codon')
>>> s == 'GGGGAAA'
True
>>> s = alignSubamplicon(refseq, 'GGGNAA', 'TTNCCC', 3, 9, 0, 0, 'codon')
>>> s == 'GGGGAAA'
True
>>> s = alignSubamplicon(refseq, 'GTTTAA', 'TTTAAA', 3, 9, 1, 0, 'codon')
>>> s == 'GTTTAAA'
True
>>> s = alignSubamplicon(refseq, 'GGGGTA', 'TTACCC', 3, 9, 1, 0, 'codon')
>>> s == 'GGGGTAA'
True
>>> s = alignSubamplicon(refseq, 'GGGCTA', 'TTAGCC', 3, 9, 1, 0, 'codon')
>>> s == False
True
"""
r2 = reverseComplement(r2)
if use_cutils:
return dms_tools2._cutils.alignSubamplicon(refseq, r1, r2,
refseqstart, refseqend, maxmuts, maxN, chartype)
assert chartype in ['codon'], "Invalid chartype"
if chartype == 'codon':
assert len(refseq) % 3 == 0, "refseq length not divisible by 3"
len_subamplicon = refseqend - refseqstart + 1
len_r1 = len(r1)
len_subamplicon_minus_len_r2 = len_subamplicon - len(r2)
subamplicon = []
for i in range(len_subamplicon):
if i < len_subamplicon_minus_len_r2: # site not in r2
if i < len_r1: # site in r1
subamplicon.append(r1[i])
else: # site not in r1
subamplicon.append('N')
else: # site in r2
if i < len_r1: # site in r1
r1i = r1[i]
r2i = r2[i - len_subamplicon_minus_len_r2]
if r1i == r2i:
subamplicon.append(r1i)
elif r1i == 'N':
subamplicon.append(r2i)
elif r2i == 'N':
subamplicon.append(r1i)
else:
subamplicon.append('N')
else: # site not in r1
subamplicon.append(r2[i - len_subamplicon_minus_len_r2])
subamplicon = ''.join(subamplicon)
if subamplicon.count('N') > maxN:
return False
if chartype == 'codon':
if refseqstart % 3 == 1:
startcodon = (refseqstart + 2) // 3
codonshift = 0
elif refseqstart % 3 == 2:
startcodon = (refseqstart + 1) // 3 + 1
codonshift = 2
elif refseqstart % 3 == 0:
startcodon = refseqstart // 3 + 1
codonshift = 1
nmuts = 0
for icodon in range(startcodon, refseqend // 3 + 1):
mutcodon = subamplicon[3 * (icodon - startcodon) + codonshift :
3 * (icodon - startcodon) + 3 + codonshift]
if ('N' not in mutcodon) and (mutcodon !=
refseq[3 * icodon - 3 : 3 * icodon]):
nmuts += 1
if nmuts > maxmuts:
return False
else:
raise ValueError("Invalid chartype")
return subamplicon
[docs]def incrementCounts(refseqstart, subamplicon, chartype, counts):
"""Increment counts dict based on an aligned subamplicon.
This is designed for keeping track of counts of different
mutations / identities when aligning many subamplicons to
a sequence.
Any positions where `subamplicon` has an ``N`` are ignored,
and not added to `counts`.
Args:
`refseqstart` (int)
First nucleotide position in 1, 2, ... numbering
where `subamplicon` aligns.
`subamplicon` (str)
The subamplicon.
`chartype` (str)
Character type for which we are counting mutations.
Currently, only allowable value is 'codon'.
`counts` (dict)
Stores counts of identities, and is incremented by
this function. Is a dict keyed by every possible
character (e.g., codon), with values lists with
element `i` holding the counts for position `i`
in 0, 1, ... numbering.
Returns:
On completion, `counts` has been incremented.
>>> codonlen = 10
>>> counts = dict([(codon, [0] * codonlen) for codon
... in CODONS])
>>> subamplicon1 = 'ATGGACTTTC'
>>> incrementCounts(1, subamplicon1, 'codon', counts)
>>> subamplicon2 = 'GGTCTTTCCCGGN'
>>> incrementCounts(3, subamplicon2, 'codon', counts)
>>> counts['ATG'][0] == 1
True
>>> counts['GAC'][1] == 1
True
>>> counts['GTC'][1] == 1
True
>>> counts['TTT'][2] == 2
True
>>> counts['CCC'][3] == 1
True
>>> sum([sum(c) for c in counts.values()]) == 6
True
"""
if chartype == 'codon':
if refseqstart % 3 == 1:
startcodon = (refseqstart + 2) // 3 - 1
codonshift = 0
elif refseqstart % 3 == 2:
startcodon = (refseqstart + 1) // 3
codonshift = 2
elif refseqstart % 3 == 0:
startcodon = refseqstart // 3
codonshift = 1
else:
raise ValueError("Invalid chartype")
shiftedsubamplicon = subamplicon[codonshift : ]
for i in range(len(shiftedsubamplicon) // 3):
codon = shiftedsubamplicon[3 * i : 3 * i + 3]
if 'N' not in codon:
counts[codon][startcodon + i] += 1
[docs]def codonToAACounts(counts):
"""Makes amino-acid counts `pandas.DataFrame` from codon counts.
Args:
`counts` (`pandas.DataFrame`)
Columns are the string `site` `wildtype` and all codons
in `CODONS`. Additional columns are allowed
but ignored.
Returns:
`aacounts` (`pandas.DataFrame`)
Columns are the string `site` and all amino acids
in `AAS_WITHSTOP` with counts for each
amino acid made by summing counts for encoding codons.
>>> d = {'site':[1, 2], 'othercol':[0, 0], 'ATG':[105, 1],
... 'GGG':[3, 117], 'GGA':[2, 20], 'TGA':[0, 1],
... 'wildtype':['ATG', 'GGG']}
>>> for codon in CODONS:
... if codon not in d:
... d[codon] = [0, 0]
>>> counts = pandas.DataFrame(d)
>>> aacounts = codonToAACounts(counts)
>>> 'othercol' in aacounts.columns
False
>>> all(aacounts['site'] == [1, 2])
True
>>> all(aacounts['wildtype'] == ['M', 'G'])
True
>>> all(aacounts['M'] == [105, 1])
True
>>> all(aacounts['G'] == [5, 137])
True
>>> all(aacounts['*'] == [0, 1])
True
>>> all(aacounts['V'] == [0, 0])
True
"""
d = dict([(key, []) for key in ['site', 'wildtype'] +
AAS_WITHSTOP])
for (i, row) in counts.iterrows():
d['site'].append(row['site'])
d['wildtype'].append(CODON_TO_AA[row['wildtype']])
for aa in AAS_WITHSTOP:
d[aa].append(0)
for c in CODONS:
d[CODON_TO_AA[c]][-1] += (row[c])
return pandas.DataFrame(d)
[docs]def annotateCodonCounts(counts):
"""Gets annotated `pandas.DataFrame` from codon counts.
Some of the programs (e.g., `dms2_bcsubamplicons`) create
``*_codoncounts.csv`` files when run with ``--chartype codon``.
These CSV files have columns indicating the `site` and `wildtype`
codon, as well as a column for each codon giving the counts for that
codon. This function reads that file (or a `pandas.DataFrame` read
from it) to return a `pandas.DataFrame` where a variety of additional
useful annotations have been added.
Args:
`counts` (str)
Name of existing codon counts CSV file, or `pandas.DataFrame`
holding counts.
Returns:
`df` (`pandas.DataFrame`)
The DataFrame with the information in `counts` plus
the following added columns for each site:
`ncounts` : number of counts at site
`mutfreq` : mutation frequency at site
`nstop` : number of stop-codon mutations
`nsyn` : number of synonymous mutations
`nnonsyn` : number of nonsynonymous mutations
`n1nt` : number of 1-nucleotide codon mutations
`n2nt` : number of 2-nucleotide codon mutations
`n3nt` : number of 3-nucleotide codon mutations
`AtoC`, `AtoG`, etc : number of each nucleotide mutation
type among codon mutations with **one** nucleotide change.
`mutfreq1nt`, `mutfreq2nt`, `mutfreq3nt` : frequency
of 1-, 2-, and 3-nucleotide codon mutations at site.
>>> d = {'site':[1, 2], 'wildtype':['ATG', 'GGG'], 'ATG':[105, 1],
... 'GGG':[3, 117], 'GGA':[2, 20], 'TGA':[0, 1]}
>>> for codon in CODONS:
... if codon not in d:
... d[codon] = [0, 0]
>>> counts = pandas.DataFrame(d)
>>> with tempfile.NamedTemporaryFile(mode='w') as f:
... counts.to_csv(f, index=False)
... f.flush()
... df = annotateCodonCounts(f.name)
>>> all([all(df[col] == counts[col]) for col in counts.columns])
True
>>> all(df['ncounts'] == [110, 139])
True
>>> all(df['mutfreq'] == [5 / 110., 22 / 139.])
True
>>> all(df['nstop'] == [0, 1])
True
>>> all(df['nsyn'] == [0, 20])
True
>>> all(df['nnonsyn'] == [5, 1])
True
>>> all(df['n1nt'] == [0, 20])
True
>>> all(df['n2nt'] == [3, 2])
True
>>> all(df['n3nt'] == [2, 0])
True
>>> all(df['GtoA'] == [0, 20])
True
>>> all(df['AtoC'] == [0, 0])
True
>>> all(df['mutfreq1nt'] == [0, 20 / 139.])
True
>>> all(df['mutfreq3nt'] == [2 / 110., 0])
True
"""
if isinstance(counts, str):
df = pandas.read_csv(counts)
elif isinstance(counts, pandas.DataFrame):
df = counts.copy()
else:
raise ValueError("invalid counts")
assert set(CODONS) <= set(df.columns), \
"Did not find counts for all codons".format(counts)
df['ncounts'] = df[CODONS].sum(axis=1)
df['mutfreq'] = (((df['ncounts'] - df.lookup(df['wildtype'].index,
df['wildtype'].values)) / df['ncounts'].astype('float'))
.fillna(0))
ntchanges = ['{0}to{1}'.format(nt1, nt2) for nt1 in dms_tools2.NTS
for nt2 in dms_tools2.NTS if nt1 != nt2]
nstoplist = []
nsynlist = []
nnonsynlist = []
nXntlists = dict([(n + 1, []) for n in range(3)])
nntchangeslists = dict([(ntchange, []) for ntchange in ntchanges])
for (i, row) in df.iterrows():
nstop = nsyn = nnonsyn = 0
nXnt = dict([(n + 1, 0) for n in range(3)])
nntchanges = dict([(ntchange, 0) for ntchange in ntchanges])
wt = row['wildtype']
wtaa = CODON_TO_AA[wt]
for c in CODONS:
if c == wt:
continue
aa = CODON_TO_AA[c]
if aa == '*':
nstop += row[c]
elif aa == wtaa:
nsyn += row[c]
else:
nnonsyn += row[c]
ntdiffs = ['{0}to{1}'.format(nt1, nt2) for (nt1, nt2)
in zip(wt, c) if nt1 != nt2]
nXnt[len(ntdiffs)] += row[c]
if len(ntdiffs) == 1:
nntchanges[ntdiffs[0]] += row[c]
nstoplist.append(nstop)
nsynlist.append(nsyn)
nnonsynlist.append(nnonsyn)
for n in range(3):
nXntlists[n + 1].append(nXnt[n + 1])
for ntchange in ntchanges:
nntchangeslists[ntchange].append(nntchanges[ntchange])
df = df.assign(nstop=nstoplist, nsyn=nsynlist, nnonsyn=nnonsynlist)
df = df.assign(n1nt=nXntlists[1], n2nt=nXntlists[2], n3nt=nXntlists[3])
for ntchange in ntchanges:
df[ntchange] = nntchangeslists[ntchange]
for nnt in range(3):
df['mutfreq{0}nt'.format(nnt + 1)] = (df['n{0}nt'.format(nnt + 1)]
/ df['ncounts'].astype('float')).fillna(0)
return df
[docs]def adjustErrorCounts(errcounts, counts, charlist, maxexcess):
"""Adjust error counts to not greatly exceed counts of interest.
This function is useful when estimating preferences. Under the
model, the error-control should not have a higher rate of error
than the actual sample. However, this could happen if the experimental
data don't fully meet the assumptions. So this function scales
down the error counts in that case.
Args:
`errcounts` (pandas.DataFrame)
Holds counts for error control.
`counts` (pandas.DataFrame)
Holds counts for which we are correcting errors.
`charlist` (list)
Characters for which we have counts.
`maxexcess` (int)
Only let error-control counts exceed actual by this much.
Returns:
A copy of `errcounts` except for any non-wildtype character,
the maximum frequency of that character is adjusted to be
at most the number predicted by the frequency in `counts`
plus `maxexcess`.
>>> counts = pandas.DataFrame({'site':[1], 'wildtype':['A'],
... 'A':500, 'C':10, 'G':40, 'T':20})
>>> errcounts = pandas.DataFrame({'site':[1], 'wildtype':['A'],
... 'A':250, 'C':1, 'G':30, 'T':10})
>>> charlist = ['A', 'C', 'G', 'T']
>>> errcounts = errcounts[['site', 'wildtype'] + charlist]
>>> adj_errcounts = adjustErrorCounts(errcounts, counts, charlist, 1)
>>> set(adj_errcounts.columns) == set(errcounts.columns)
True
>>> all(adj_errcounts['site'] == errcounts['site'])
True
>>> all(adj_errcounts['wildtype'] == errcounts['wildtype'])
True
>>> (adj_errcounts[adj_errcounts['site'] == 1][charlist].values[0]
... == numpy.array([250, 1, 21, 10])).all()
True
"""
cols = counts.columns
counts = counts.sort_values('site')
errcounts = errcounts.sort_values('site')
assert all(counts['site'] == errcounts['site'])
assert all(counts['wildtype'] == errcounts['wildtype'])
counts['total'] = counts[charlist].sum(axis=1).astype('float')
errcounts['total'] = errcounts[charlist].sum(axis=1)
maxallowed = (counts[charlist].div(counts['total'], axis=0).multiply(
errcounts['total'], axis=0) + maxexcess).round().astype('int')
adj_errcounts = errcounts[charlist].where(errcounts[charlist] < maxallowed,
maxallowed[charlist])
for c in charlist:
adj_errcounts[c] = adj_errcounts[c].where(counts['wildtype'] != c,
errcounts[c])
for col in cols:
if col not in charlist:
adj_errcounts[col] = counts[col]
return adj_errcounts[cols]
[docs]def renumberSites(renumbfile, infiles, missing='error',
outfiles=None, outprefix=None, outdir=None):
"""Renumber sites in CSV files.
Switch numbering scheme in files with a column named `site`.
You must specify **exactly one** of `outfiles`,
`outprefix`, and `outdir` as something other than `None`.
Args:
`renumbfile` (str)
Name of existing CSV file with the re-numbering scheme.
Should have columns with name `original` and `new`.
Each entry in `original` should refer to a site in
the input files, and each entry in `new` should be
the new number for this site. If an entry in `new`
is `None` or `nan` then it is dropped from the newly
numbered files regardless of `missing`.
`infiles` (list)
List of existing CSV files that we are re-numbering.
Each file must have an entry of `site`.
`missing` (str)
How to handle sites in `infiles` but not `renumbfile`.
- `error`: raise an error
- `skip`: skip renumbering, leave with original number
- `drop`: drop any sites not in `renumbfile`
`outfiles` (list)
List of output files of the same length as `infiles`.
The numbered version of `infiles` is named as the
corresponding entry in `outfiles`.
`outdir` (str)
A directory name. The renumbered files have the same
names as in `infile`, but are now placed in `outdir`.
`outprefix` (str)
The renumbered files have the same names and locations
as `infiles`, but have the pre-pended filename extension
`outprefix`.
"""
assert os.path.isfile(renumbfile), "no renumbfile {0}".format(renumbfile)
renumb = pandas.read_csv(renumbfile)
assert {'original', 'new'} <= set(renumb.columns), \
"renumbfile lacks columns `original` and/or `new`"
for col in ['original', 'new']:
assert len(renumb[col]) == len(set(renumb[col])), \
"duplicate sites for {0} in {1}".format(col, renumbfile)
renumb[col] = renumb[col].astype('str')
assert isinstance(infiles, list), "infiles is not a list"
nin = len(infiles)
infiles = [os.path.abspath(f) for f in infiles]
assert len(set(infiles)) == nin, "duplicate files in `infiles`"
if outfiles is not None:
assert isinstance(outfiles, list), "`outfiles` not list"
assert (outdir is None) and (outprefix is None), \
"only specify one of `outfiles`, `outdir`, and `outprefix`"
nout = len(outfiles)
assert nout == nin, "`outfiles` and `infiles` different length"
elif outdir is not None:
assert isinstance(outdir, str), "`outdir` should be string"
assert (outfiles is None) and (outprefix is None), \
"only specify one of `outfiles`, `outdir`, and `outprefix`"
if not os.path.isdir(outdir):
os.mkdir(outdir)
outfiles = [os.path.join(outdir, os.path.basename(f))
for f in infiles]
elif outprefix is not None:
assert isinstance(outprefix, str), "`outdir` should be string"
assert (outfiles is None) and (outdir is None), \
"only specify one of `outfiles`, `outdir`, and `outprefix`"
outfiles = [os.path.join(os.path.dirname(f), outprefix +
os.path.basename(f)) for f in infiles]
else:
raise ValueError("specify `outdir`, `outprefix`, `outfiles`")
outfiles = [os.path.abspath(f) for f in outfiles]
assert len(set(outfiles)) == len(outfiles), "duplicate files in `outfiles`"
assert not set(outfiles).intersection(set(infiles)), \
"some in and outfiles the same"
for (fin, fout) in zip(infiles, outfiles):
df_in = pandas.read_csv(fin)
assert 'site' in df_in.columns, "no `site` column in {0}".format(fin)
df_in['site'] = df_in['site'].astype('str')
if missing == 'error':
if set(df_in['site']) > set(renumb['original']):
raise ValueError("`missing` is `error`, excess sites in {0}"
.format(fin))
elif missing == 'skip':
pass
elif missing == 'drop':
df_in = df_in[df_in['site'].isin(renumb['original'])]
else:
raise ValueError("invalid `missing` of {0}".format(missing))
# can't just use replace below because of this bug:
# https://github.com/pandas-dev/pandas/issues/16051
unmappedsites = df_in[~df_in['site'].isin(renumb['original'])]['site']
replacemap = dict(zip(
renumb['original'].append(unmappedsites),
renumb['new'].append(unmappedsites)))
df_in['site'] = df_in['site'].map(replacemap)
df_in = (df_in[df_in['site'].notnull()]
.query('site != "NaN"')
.query('site != "nan"')
.query('site != "None"')
)
df_in.to_csv(fout, index=False)
[docs]def codonEvolAccessibility(seqs):
"""Accessibility of amino acids by nucleotide mutations.
Args:
`seqs` (str or list)
A single coding sequence or a list of such sequences.
Returns:
A pandas DataFrame listing all sites in the sequence(s)
numbered 1, 2, ..., with columns giving the accessibility
of each amino acid by single nucleotide mutations.
The accessibility of codon :math:`c` to amino-acid :math:`a`
by single-nucleotide mutations is defined as the minimum
number of nucleotide mutations needed to generate that
amino-acid.
For a collection of sequences, we calculate the
accessibility as the weighted average of the accessibilities
of all codons observed at that site in the collection of
sequences.
As an example, compute accessibility for one sequence:
>>> s = "ATGGGA"
>>> acc = codonEvolAccessibility(s)
The returned pandas DataFrame `acc` is has a column named
`site` plus columns for all amino acids:
>>> all(acc.columns == ['site'] + AAS_WITHSTOP)
True
We look at entries for a few amino acids. At the first
site, the wildtype entry in the sequence `s` is the codon
for *M* (``ATG``). So at this site, the distance to *M*
is 0. The distance to *I* (which has codon ``ATA`` as a
codon) is 1, and the distance to *W* (which has only ``TGG``
as a codon) is 2.
>>> acc[['site', 'G', 'I', 'M', 'W']]
site G I M W
0 1 2.0 1.0 0.0 2.0
1 2 0.0 2.0 3.0 2.0
If we pass the function a list of multiple sequences,
then the accessibilities are averaged over the sequences:
>>> acc2 = codonEvolAccessibility(['ATGGGA', 'ATAGGA'])
>>> acc2[['site', 'G', 'I', 'M', 'W']]
site G I M W
0 1 2.0 0.5 0.5 2.5
1 2 0.0 2.0 3.0 2.0
"""
# get number of nucleotide diffs between all pairs of codons
nt_diffs = dict([
((c1, c2), sum(1 for x1, x2 in zip(c1, c2) if x1 != x2))
for c1, c2 in itertools.product(CODONS, repeat=2)])
# get number of nucleotide diffs to nearest codon for amino acid
aa_nt_diffs = {}
for c in CODONS:
for aa, othercs in AA_TO_CODONS.items():
aa_nt_diffs[(c, aa)] = min([nt_diffs[(c, c2)]
for c2 in othercs])
# make sure seqs are of same valid length
if isinstance(seqs, str):
seqs = [seqs]
assert len(seqs[0]) % 3 == 0, "seqs not of length divisible by 3"
assert all([len(seqs[0]) == len(s) for s in seqs[1 : ]]), \
"seqs not all of same length"
# get nucleotide distances, summing for all sequences
dists = collections.defaultdict(lambda: collections.defaultdict(float))
for s in seqs:
for r in range(len(s) // 3):
c = s[3 * r : 3 * r + 3]
assert c in CODONS, "invalid codon {0}".format(c)
for aa in AAS_WITHSTOP:
dists[r + 1][aa] += aa_nt_diffs[(c, aa)]
return (pandas.DataFrame.from_dict(dists, orient='index')
.rename_axis('site')
[AAS_WITHSTOP]
/ len(seqs)).reset_index()
[docs]def sigFigStr(x, nsig):
"""Get str of `x` with `nsig` significant figures.
>>> sigFigStr(11190, 2)
'11000'
>>> sigFigStr(117, 2)
'120'
>>> sigFigStr(6, 2)
'6.0'
>>> sigFigStr(0.213, 2)
'0.21'
>>> sigFigStr(0.007517, 3)
'0.00752'
"""
if x <= 0:
raise ValueError('currently only handles numbers > 0')
x = float(f"{{:.{nsig}g}}".format(x))
if x >= 10**(nsig - 1):
return '{:d}'.format(round(x))
else:
predecimal = math.floor(math.log10(x)) + 1
postdecimal = nsig - predecimal
assert postdecimal > 0, str(x)
return f"{{:.{postdecimal}f}}".format(x)
[docs]def getSubstitutions(wildtype, mutant, amino_acid=False):
"""Get space delimited string of substitutions
Args:
`wildtype` (str):
The wildtype sequence
`mutant` (str):
The mutant sequence
`amino_acid` (bool)
Specify whether the sequence is amino acid.
Default is False
Returns:
A space delimited string of substitutions present in the
mutant sequence
>>> getSubstitutions('AGT', 'TGT')
'A1T'
>>> getSubstitutions('AAGTAACGA', 'ATCTAACGA')
'A2T G3C'
>>> getSubstitutions('TYARV', 'GYAGV', amino_acid=True)
'T1G R4G'
"""
if len(wildtype) != len(mutant):
raise ValueError('wildtype and mutant must be same length')
subs = []
for site in range(len(wildtype)):
wt = wildtype[site]
mut = mutant[site]
if amino_acid:
if wt not in AAS_WITHSTOP:
raise ValueError (f"Invalid wt residue {wt} at site {site+1}")
if mut not in AAS_WITHSTOP:
raise ValueError (f"Invalid mutant residue {mut} at site {site+1}")
else:
if wt not in NTS:
raise ValueError (f"Invalid wt nucleotide {wt} at site {site+1}")
if mut not in NTS:
raise ValueError (f"Invalid mutant nucleotide {mut} at site {site+1}")
if wt!=mut:
pos = str(site + 1)
subs.append(f"{wt}{pos}{mut}")
subs = ' '.join(subs)
return subs
[docs]def codon_to_nt_counts(codoncounts):
"""Convert codon counts file to nucleotide counts.
Args:
`codoncounts` (str or pandas.DataFrame)
Codon counts in format produced by ``dms2_bcsubamp``,
either as CSV file or data frame holding CSV.
Returns:
pandas.DataFrame with nucleotide counts.
Example:
>>> with tempfile.NamedTemporaryFile('w') as f:
... _ = f.write(textwrap.dedent('''
... site,wildtype,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,AGG,AGT,ATA,ATC,ATG,ATT,CAA,CAC,CAG,CAT,CCA,CCC,CCG,CCT,CGA,CGC,CGG,CGT,CTA,CTC,CTG,CTT,GAA,GAC,GAG,GAT,GCA,GCC,GCG,GCT,GGA,GGC,GGG,GGT,GTA,GTC,GTG,GTT,TAA,TAC,TAG,TAT,TCA,TCC,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
... 1,ATG,0,0,0,0,0,0,2,0,0,0,0,0,8,0,333985,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
... 2,AAG,16,20,333132,41,13,12,27,14,8,6,67,8,9,13,29,9,10,11,12,8,10,15,15,11,6,9,3,7,8,10,17,4,3,7,49,7,9,14,9,4,10,7,7,7,9,11,11,5,14,14,11,6,13,16,15,14,9,9,15,8,9,11,8,15
... 3,GCA,2,3,8,3,34,11,7,6,7,6,9,8,4,3,5,0,6,14,10,12,6,8,7,10,5,11,7,6,6,1,3,12,19,6,11,9,333250,10,6,9,15,3,5,5,37,9,9,7,8,4,8,3,23,5,7,8,6,11,7,10,7,9,3,6
... '''.strip()))
... f.flush()
... nt_counts = codon_to_nt_counts(f.name)
>>> nt_counts
site wildtype A C G T
0 1 A 334009 0 6 0
1 2 T 0 2 0 334013
2 3 G 8 0 333993 14
3 4 A 333424 156 169 187
4 5 A 333361 211 186 178
5 6 G 156 185 333427 168
6 7 G 116 124 333410 125
7 8 C 126 333407 121 121
8 9 A 333435 114 112 114
"""
if not isinstance(codoncounts, pandas.DataFrame):
codoncounts = pandas.read_csv(codoncounts)
if codoncounts['site'].dtype != int:
raise ValueError('`site` column in `codoncounts` must be integer')
nt_counts = []
for i_nt in [0, 1, 2]:
nt_counts.append(
codoncounts
.melt(id_vars=['site', 'wildtype'],
var_name='codon',
value_name='count',
)
.assign(
site=lambda x: 3 * (x['site'] - 1) + i_nt + 1,
wildtype=lambda x: x['wildtype'].str[i_nt],
nucleotide=lambda x: x['codon'].str[i_nt],
)
.groupby(['site', 'wildtype', 'nucleotide'])
.aggregate({'count': 'sum'})
.reset_index()
.pivot_table(values='count',
columns='nucleotide',
index=['site', 'wildtype'])
.reset_index()
)
nt_counts = (pandas.concat(nt_counts)
.sort_values('site')
.reset_index(drop=True)
)
del nt_counts.columns.name
return nt_counts
[docs]def barcodeInfoToCodonVariantTable(samples, geneseq, path=None):
"""Convert barcode info files into a CodonVariantTable
Convert barcode info files output from `dms2_bcsubamp` into a
`CodonVariantTable`. Barcode info files contain reads and barcodes from
barcoded subamplicon sequencing, described
`here <https://jbloomlab.github.io/dms_tools2/bcsubamp.html>`_.
This function takes consensus reads retained by `dms2_bcsubamp`,
gives each unique sequence a numerical barcode (since the barcodes from
`dms2_bcsubamp` could come from the same variant), and counts the number
of retained consensus reads corresponding to each sequence. Then, a
`CodonVariantTable` is made using the sequences and their numerical
barcodes, and counts are added based on the number of retained consensus
reads of those sequences. Therefore, the `CodonVariantTable` will only
contain one 'variant' for each unique sequence with the total count for all
the unbarcoded variants in the experiment which had the same sequence.
Args:
`samples` (dict):
Dictionary with libraries as keys and lists of info file prefixes
(file names without the '_bcinfo.txt.gz') for files corresponding
to those libraries as values.
Example: {'library-1':['condition-1-library-1'],
'library-2':['condition-1-library-2']}
`geneseq` (str):
The wildtype gene sequence
`path` (str)
Directory in which barcode info files are located
Returns:
A `dms_variants.codonvarianttable.CodonVariantTable` with 'counts'
generated from the barcode info files
"""
# Set up re matchers for looking at lines
matcher = re.compile(r'(?P<linetype>^.*\:) '
r'(?P<contents>.*$)')
alt_matcher = re.compile(r'(?P<linetype>^R\d READS:$)')
read_matcher = re.compile(r'(?P<read>^[ATGCN\s]*$)')
# Create a dictionary to contain dictionaries of each library's barcodes
libraries = {}
# Initialize lists for making the codonvarianttable
barcodes = []
subs = []
variant_call_support = []
library_list = []
# For each library, go through each sample file and collect data
for library in samples.keys():
# Initialize dictionary to contain this library's reads and barcodes
barcode_dictionary = {}
# Start a barcode count for this library
cur_barcode = 1
# For each barcode info file corresponding to a sample in this library
for sample in samples[library]:
# Set initial conditions
take_next = False
description_skipped = False
# Find the file
f = f"{sample}_bcinfo.txt.gz"
if path:
file_path = os.path.join(os.path.abspath(path), f)
else:
file_path = f
# Open the file and loop through it to find retained consensus
# reads and give them each a new barcode
with gzip.open(file_path, 'r') as f:
# Make sure the first line looks like it is supposed to
firstline = f.readline()
firstline = firstline.decode()
first_match = matcher.match(firstline)
if first_match.group('linetype') != 'BARCODE:':
raise ValueError(f"Unexpected first line {firstline}: may be "
"unexpected file type")
else:
previous_line = first_match
# Go through the lines, making they are in the expected order
for line in f:
line = line.decode()
line_match = matcher.match(line)
if not line_match:
line_match = alt_matcher.match(line)
if not line_match:
read_match = read_matcher.match(line)
if not read_match:
raise ValueError(f"Unable to recognize line {line}")
else:
line_is_read = True
previous_linetype = previous_line.group('linetype')
if previous_linetype != 'R1 READS:' and \
previous_linetype != 'R2 READS:':
raise ValueError(f"Unexpected line {line}")
else:
line_is_read = False
if previous_line.group('linetype') == 'BARCODE:':
if line_match.group('linetype') != 'RETAINED:':
raise ValueError(f"Unexpected line {line}")
# Decide whether to retain the next consensus or not
else:
if line_match.group('contents') == 'False':
retain = False
elif line_match.group('contents') == 'True':
retain = True
else:
raise ValueError(f"Unexpected line {line}")
elif previous_line.group('linetype') == 'RETAINED:':
if line_match.group('linetype') != 'DESCRIPTION:':
raise ValueError(f"Unexpected line {line}")
elif previous_line.group('linetype') == 'DESCRIPTION:':
if line_match.group('linetype') != 'CONSENSUS:':
raise ValueError(f"Unexpected line {line}")
# Make sure we know whether to retain or not
elif not isinstance(retain, bool):
raise ValueError(
f"Unclear whether to retain {line_match.group('contents')}"
)
elif retain:
read = line_match.group('contents')
# Add the read to the dictionary if not in it
# Also give it a barcode
if 'N' not in read:
if read not in barcode_dictionary:
# Create the sequence in the dictionary
barcode_dictionary[read] = {}
# Give it an initial count of 1 for this sample
barcode_dictionary[read][sample] = 1
# Give it the next barcode
barcode_dictionary[read]['barcode'] = cur_barcode
# Save values for making CodonVariantTable
barcodes.append(cur_barcode)
subs.append(getSubstitutions(geneseq, read))
variant_call_support.append(1)
library_list.append(library)
# Advance current barcode
cur_barcode += 1
else:
# Add a counter for the sample if sequence
# not seen for this sample yet
if sample not in barcode_dictionary[read]:
barcode_dictionary[read][sample] = 1
else:
# Add another count to this read for
# this sample
barcode_dictionary[read][sample] += 1
# Set retain to None
retain = None
elif previous_line.group('linetype') == 'CONSENSUS:':
if line_match.group('linetype') != 'R1 READS:':
raise ValueError(f"Unexpected line {line}")
elif previous_line.group('linetype') == 'R1 READS:':
if not line_is_read:
if line_match.group('linetype') != 'R2 READS:':
raise ValueError(f"Unexpected line {line}")
elif previous_line.group('linetype') == 'R2 READS:':
if not line_is_read:
if line_match.group('linetype') != 'BARCODE:':
raise ValueError(f"Unexpected line {line}")
# Save this line as the previous line if it is not a read
if not line_is_read:
previous_line = line_match
# After going through each file for a library, save its dictionary with
# reads and barcodes
libraries[library] = barcode_dictionary
# Make the dataframe for creating the codonvarianttable
df = {'barcode':barcodes,
'substitutions':subs,
'library':library_list,
'variant_call_support':variant_call_support,
}
df = pandas.DataFrame(df)
# Make the codonvarianttable
with tempfile.NamedTemporaryFile(mode='w') as f:
df.to_csv(f, index=False)
f.flush()
variants = dms_variants.codonvarianttable.CodonVariantTable(
barcode_variant_file=f.name,
geneseq=geneseq)
# Make the counts dataframe:
# Initialize list of dataframes
dfs = []
# Loop through each library and produce count dataframes for each sample
for library in libraries:
barcode_dictionary = libraries[library]
for sample in samples[library]:
barcodes_list = []
counts_list = []
sample_list = []
library_list = []
# Get counts for this sample
for sequence in barcode_dictionary.keys():
if sample not in barcode_dictionary[sequence].keys():
counts_list.append(0)
else:
counts_list.append(barcode_dictionary[sequence][sample])
barcodes_list.append(barcode_dictionary[sequence]['barcode'])
sample_list.append(sample)
library_list.append(library)
# Make a dataframe for this sample
data = {'barcode':barcodes_list,
'count':counts_list,
'sample':sample_list,
'library':library_list,
}
data = pandas.DataFrame(data)
# Append it to the list of dataframes
dfs.append(data)
# Concatenate the list of dataframes into a counts dataframe
barcode_counts = pandas.concat(dfs)
# Add the counts for each sample to the codonvarianttable
for library in libraries:
for sample in samples[library]:
icounts = barcode_counts.query('library == @library & sample == @sample')
icounts = icounts[['barcode', 'count']]
variants.addSampleCounts(library, sample, icounts)
return(variants)
if __name__ == '__main__':
import doctest
doctest.testmod()