Source code for dms_tools2.utils

"""
===================
utils
===================

Miscellaneous utilities for ``dms_tools2``.
"""


import os
import math
import sys
import time
import platform
import importlib
import logging
import tempfile
import textwrap
import itertools
import collections
import random
import re

import pysam
import numpy
import scipy.misc
import scipy.special
import pandas
import gzip

import dms_variants.codonvarianttable

import dms_tools2
from dms_tools2 import CODONS, CODON_TO_AA, AAS_WITHSTOP, AA_TO_CODONS, NTS
import dms_tools2._cutils


[docs]def sessionInfo():
    """Returns string with information about session / packages."""
    s = [
            'Version information:',
            '\tTime and date: {0}'.format(time.asctime()),
            '\tPlatform: {0}'.format(platform.platform()),
            '\tPython version: {0}'.format(
                    sys.version.replace('\n', ' ')),
            '\tdms_tools2 version: {0}'.format(dms_tools2.__version__),
            ]
    for modname in ['Bio', 'pandas', 'numpy', 'IPython',
            'jupyter', 'matplotlib', 'plotnine', 'natsort', 'pystan',
            'scipy', 'seaborn', 'phydmslib', 'statsmodels', 'rpy2',
            'regex', 'umi_tools']:
        try:
            v = importlib.import_module(modname).__version__
            s.append('\t{0} version: {1}'.format(modname, v))
        except AttributeError:
            s.append('\t{0} version unknown'.format(modname))
        except ImportError:
            s.append("\t{0} cannot be imported".format(modname))
    return '\n'.join(s)


[docs]def initLogger(logfile, prog, args):
    """Initialize output logging for scripts.

    Args:
        `logfile` (str or `sys.stdout`)
            Name of file to which log is written, or 
            `sys.stdout` if you just want to write information
            to standard output.
        `prog` (str)
            Name of program for which we are logging.
        `args` (dict)
            Program arguments as arg / value pairs.

    Returns:
        If `logfile` is a string giving a file name, returns
        an opened and initialized `logging.Logger`. If `logfile`
        is `sys.stdout`, then writes information to `sys.stdout`.
        In either case, basic information is written about the program 
        and args.
    """
    if logfile == sys.stdout:
        logfile.write("Beginning execution of {0} in directory {1}\n\n".format(
                prog, os.getcwd()))
        logfile.write("{0}\n\n".format(sessionInfo()))
        logfile.write("Parsed the following arguments:\n\t{0}\n\n".format(
                '\n\t'.join(['{0} = {1}'.format(arg, val) for (arg, val)
                in args.items()])))
    else:
        if os.path.isfile(logfile):
            os.remove(logfile)
        logging.basicConfig(level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(prog)
        logfile_handler = logging.FileHandler(logfile)
        logger.addHandler(logfile_handler)
        formatter = logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s')
        logfile_handler.setFormatter(formatter)
        try:
            logger.info("Beginning execution of {0} in directory {1}\n"
                .format(prog, os.getcwd()))
            logger.info("Progress is being logged to {0}".format(logfile))
            logger.info("{0}\n".format(sessionInfo()))
            logger.info("Parsed the following arguments:\n\t{0}\n".format(
                    '\n\t'.join(['{0} = {1}'.format(arg, val) for (arg, val)
                    in args.items()])))
        except:
            logger.exception("Error")
            raise

        return logger


[docs]def iteratePairedFASTQ(r1files, r2files, r1trim=None, r2trim=None):
    """Iterates over FASTQ files for single or paired-end sequencing.

    Args:
        `r1files` (list or str)
            Name of R1 FASTQ file or list of such files. Can optionally
            be gzipped.
        `r2files` (list or str or `None`)
            Like `r1files` but for R2 files, or `None` if no R2.
        `r1trim` (int or `None`)
            If not `None`, trim `r1` and `q1` to be no longer than this.
        `r2trim` (int or `None`)
            Like `r1trim` but for R2.

    Returns:
        Each iteration returns `(name, r1, r2, q1, q2, fail)` where:

            - `name` is a string giving the read name

            - `r1` and `r2` are strings giving the reads; `r2`
              is `None` if no R2.

            - `q1` and `q2` are strings giving the PHRED Q scores;
              `q2` is none if no R2.

            - `fail` is `True` if either read failed Illumina chastity
              filter, `False` if both passed, `None` if info not present.

    We run a simple test by first writing an example FASTQ file and
    then testing on it.

    >>> n1_1 = '@DH1DQQN1:933:HMLH5BCXY:1:1101:2165:1984 1:N:0:CGATGT'
    >>> r1_1 = 'ATGCAATTG'
    >>> q1_1 = 'GGGGGIIII'
    >>> n2_1 = '@DH1DQQN1:933:HMLH5BCXY:1:1101:2165:1984 2:N:0:CGATGT'
    >>> r2_1 = 'CATGCATA'
    >>> q2_1 = 'G<GGGIII'
    >>> tf = tempfile.NamedTemporaryFile
    >>> with tf(mode='w') as r1file, tf(mode='w') as r2file:
    ...     _ = r1file.write('\\n'.join([
    ...             n1_1, r1_1, '+', q1_1,
    ...             n1_1.replace(':N:', ':Y:'), r1_1, '+', q1_1,
    ...             n1_1.split()[0], r1_1, '+', q1_1,
    ...             ]))
    ...     r1file.flush()
    ...     _ = r2file.write('\\n'.join([
    ...             n2_1, r2_1, '+', q2_1,
    ...             n2_1, r2_1, '+', q2_1,
    ...             n2_1, r2_1, '+', q2_1,
    ...             ]))
    ...     r2file.flush()
    ...     itr = iteratePairedFASTQ(r1file.name, r2file.name, r1trim=4, r2trim=5)
    ...     next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], False)
    ...     next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], True)
    ...     next(itr) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             r2_1[ : 5], q1_1[ : 4], q2_1[ : 5], None)
    True
    True
    True

    Now do the same test but for just R1:

    >>> with tf(mode='w') as r1file:
    ...     _ = r1file.write('\\n'.join([
    ...             n1_1, r1_1, '+', q1_1,
    ...             n1_1.replace(':N:', ':Y:'), r1_1, '+', q1_1,
    ...             n1_1.split()[0], r1_1, '+', q1_1,
    ...             ]))
    ...     r1file.flush()
    ...     itr_R1 = iteratePairedFASTQ(r1file.name, None, r1trim=4)
    ...     next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             None, q1_1[ : 4], None, False)
    ...     next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             None, q1_1[ : 4], None, True)
    ...     next(itr_R1) == (n1_1.split()[0][1 : ], r1_1[ : 4],
    ...             None, q1_1[ : 4], None, None)
    True
    True
    True

    """
    if isinstance(r1files, str):
        r1files = [r1files]
        if r2files is not None:
            r2files = [r2files]
    if not all(map(os.path.isfile, r1files)):
        raise ValueError('cannot find all `r1files`')
    if r2files is None:
        r2files = [None] * len(r1files)
    elif len(r1files) != len(r2files):
        raise ValueError('`r1files` and `r2files` differ in length')
    elif not all(map(os.path.isfile, r2files)):
        raise ValueError('cannot find all `r2files`')
    for (r1file, r2file) in zip(r1files, r2files):
        r1reader = pysam.FastxFile(r1file)
        if r2file is None:
            read_iterator = r1reader
        else:
            r2reader = pysam.FastxFile(r2file)
            read_iterator = zip(r1reader, r2reader)
        for tup in read_iterator:
            if r2file is None:
               a1 = tup
               r2 = q2 = None
            else:
                a1, a2 = tup
                r2 = a2.sequence
                q2 = a2.quality
                if a2.comment is not None:
                    id2 = f"{a2.name} {a2.comment}".split()
                else:
                    id2 = a2.name.split()
                name2 = id2[0]
            r1 = a1.sequence
            q1 = a1.quality
            if a1.comment is not None:
                id1 = f"{a1.name} {a1.comment}".split()
            else:
                id1 = a1.name.split()
            name1 = id1[0]
            if r2file is not None:
                # trims last two chars, need for SRA downloaded files
                if name1[-2 : ] == '.1' and name2[-2 : ] == '.2':
                    name1 = name1[ : -2]
                    name2 = name2[ : -2]
                if name1 != name2:
                    raise ValueError(f"name mismatch {name1} vs {name2}")
            # parse chastity filter assuming CASAVA 1.8 header
            try:
                f1 = id1[1][2]
                if r2file is None:
                    f2 = 'N'
                else:
                    f2 = id2[1][2]
                if f1 == 'N' and f2 == 'N':
                    fail = False
                elif f1 in ['N', 'Y'] and f2 in ['N', 'Y']:
                    fail = True
            except IndexError:
                fail = None # header does not specify chastity filter
            if r1trim is not None:
                r1 = r1[ : r1trim]
                q1 = q1[ : r1trim]
            if (r2trim is not None) and (r2file is not None):
                r2 = r2[ : r2trim]
                q2 = q2[ : r2trim]
            yield (name1, r1, r2, q1, q2, fail)


[docs]def lowQtoN(r, q, minq, use_cutils=True):
    """Replaces low quality nucleotides with ``N`` characters.

    Args:
        `r` (str)
            A string representing a sequencing read.
        `q` (str)
            String of same length as `r` holding Q scores
            in Sanger ASCII encoding.
        `minq` (length-one string)
            Replace all positions in `r` where `q` is < this.
        `use_cutils` (bool)
            Use the faster implementation in the `_cutils` module.

    Returns:
        A version of `r` where all positions `i` where 
        `q[i] < minq` have been replaced with ``N``.

    >>> r = 'ATGCAT'
    >>> q = 'GB<.0+'
    >>> minq = '0'
    >>> lowQtoN(r, q, minq) == 'ATGNAN'
    True
    """
    if use_cutils:
        return dms_tools2._cutils.lowQtoN(r, q, minq)
    assert len(r) == len(q)
    return ''.join([ri if qi >= minq else 'N'
            for (ri, qi) in zip(r, q)])


[docs]def buildReadConsensus(reads, minreads, minconcur, use_cutils=True):
    """Builds consensus sequence of some reads.

    You may want to pre-fill low-quality sites with ``N``
    using `lowQtoN`. An ``N`` is considered a non-called identity.

    Args:
        `reads` (list)
            List of reads as strings. If reads are not all same
            length, shorter ones are extended from 3' end with ``N``
            to match maximal length. 
        `minreads` (int)
            Only call consensus at a site if at least this many reads 
            have called identity.
        `minconcur` (float)
            Only call consensus at site if >= this fraction of called
            identities agree.
        `use_cutils` (bool)
            Use the faster implementation in the `_cutils` module.

    Returns:
        A string giving the consensus sequence. Non-called 
        sites are returned as ``N```.

    >>> reads = ['ATGCAT',
    ...          'NTGNANA',
    ...          'ACGNNTAT',
    ...          'NTGNTA']
    >>> buildReadConsensus(reads, 2, 0.75) == 'ATGNNNAN'
    True
    >>> reads.append('CTGCATAT')
    >>> buildReadConsensus(reads, 2, 0.75) == 'NTGCATAT'
    True
    """
    if use_cutils:
        return dms_tools2._cutils.buildReadConsensus(reads, 
                minreads, minconcur)
    readlens = list(map(len, reads))
    maxlen = max(readlens)
    consensus = []
    for i in range(maxlen):
        counts = {}
        for (r, lenr) in zip(reads, readlens):
            if lenr > i:
                x = r[i]
                if x != 'N':
                    if x in counts:
                        counts[x] += 1
                    else:
                        counts[x] = 1
        ntot = sum(counts.values())
        if ntot < minreads:
            consensus.append('N')
        else:
            (nmax, xmax) = sorted([(n, x) for (x, n) in counts.items()])[-1]
            if nmax / float(ntot) >= minconcur:
                consensus.append(xmax)
            else:
                consensus.append('N')
    return ''.join(consensus)


[docs]def rarefactionCurve(barcodes, *, maxpoints=1e5, logspace=True):
    """Rarefaction curve from list of barcodes.

    Uses the analytical formula for the rarefaction curve defined
    `on Wikipedia <https://en.wikipedia.org/wiki/Rarefaction_(ecology)#Derivation>`_.

    Args:
        `barcodes` (list or pandas Series)
            Holds the list of unique barcodes for which we calculate
            the rarefaction curve. It is expected that some of these
            barcodes will be repeated multiple times in the list if
            the sampling is approaching saturation.
        `maxpoints` (int)
            Only calculate values at this many points. The benefit
            of this is that it can become very costly to calculate
            the curve at every point when there are many points.
        `logspace` (True)
            Logarithmically space the `maxpoints` points for
            the calculation. This will give better results if
            we are subsampling and the curve saturates. Only
            done if we have to subsample.

    Returns:
        The 2-tuple `(nreads, nbarcodes)`, where both `nreads` and
        `nbarcodes` are lists of the same length, and `nbarcodes[i]`
        is the expected number of barcodes observed when there are
        `nreads[i]` reads.

    Here we take a very small list and show that the results given
    by the function are equivalent to those obtained by random
    subsampling:

    >>> barcodes = ['A', 'A', 'A', 'A', 'G', 'G', 'C', 'T']
    >>> (nreads, nbarcodes) = rarefactionCurve(barcodes)
    >>> random.seed(1)
    >>> nrand = 100000
    >>> sim_equal_calc = []
    >>> for n in range(1, len(barcodes) + 1):
    ...     nbarcodes_sim = sum([len(set(random.sample(barcodes, n)))
    ...             for _ in range(nrand)]) / nrand
    ...     sim_equal_calc.append(numpy.allclose(nbarcodes_sim,
    ...             nbarcodes[nreads.index(n)], atol=1e-2))
    >>> all(sim_equal_calc)
    True
    """
    N = len(barcodes) # total number of items
    Ni = collections.Counter(barcodes)
    K = len(Ni)
    Mj = collections.Counter(Ni.values())

    Nk, num = map(numpy.array, zip(*Mj.items()))

    # use simplification that (N - Ni)Cr(n) / (N)Cr(n) =
    # [(N - Ni)! * (N - n)!] / [N! * (N - Ni - n)!]
    #
    # Also use fact that gamma(x + 1) = x!
    nbarcodes = []
    lnFactorial_N = scipy.special.gammaln(N + 1)
    if logspace and N > maxpoints:
        nreads = list(numpy.unique(numpy.logspace(
                math.log10(1), math.log10(N),
                num=int(min(N, maxpoints))).astype('int')))
    else:
        nreads = list(numpy.unique(numpy.linspace(
                1, N, num=min(N, maxpoints)).astype('int')))
    for n in nreads:
        lnFactorial_N_minus_n = scipy.special.gammaln(N - n + 1)
        i = numpy.nonzero(N - Nk - n >= 0) # indices where this is true
        nbarcodes.append(
                K - (num[i] * numpy.exp(
                            scipy.special.gammaln(N - Nk[i] + 1) +
                            lnFactorial_N_minus_n -
                            lnFactorial_N -
                            scipy.special.gammaln(N - Nk[i] - n + 1))
                    ).sum()
                )
    return (nreads, nbarcodes)


[docs]def reverseComplement(s, use_cutils=True):
    """Gets reverse complement of DNA sequence `s`.

    Args:
        `s` (str)
            Sequence to reverse complement.
        `use_cutils` (bool)
            Use the faster implementation in the `_cutils` module.

    Returns:
        Reverse complement of `s` as a str.

    >>> s = 'ATGCAAN'
    >>> reverseComplement(s) == 'NTTGCAT'
    True
    """
    if use_cutils:
        return dms_tools2._cutils.reverseComplement(s)
    return ''.join(reversed([dms_tools2.NTCOMPLEMENT[nt] for nt in s]))


[docs]def alignSubamplicon(refseq, r1, r2, refseqstart, refseqend, maxmuts,
        maxN, chartype, use_cutils=True):
    """Try to align subamplicon to reference sequence at defined location.

    Tries to align reads `r1` and `r2` to `refseq` at location
    specified by `refseqstart` and `refseqend`. Determines how many
    sites of type `chartype` have mutations, and if <= `maxmuts` conside
    the subamplicon to align if fraction of ambiguous nucleotides <= `maxN`.
    In `r1` and `r2`, an ``N`` indicates a non-called ambiguous identity.
    If the reads disagree in a region of overlap that is set to ``N`` in
    the final subamplicon, but if one read has ``N`` and the other a called
    identity, then the called identity is used in the final subamplicon.

    Args:
        `refseq` (str)
            Sequence to which we align. if `chartype` is 'codon',
            must be a valid coding (length multiple of 3).
        `r1` (str)
            The forward sequence to align.
        `r2` (str)
            The reverse sequence to align. When reverse complemented,
            should read backwards in `refseq`.
        `refseqstart` (int)
            The nucleotide in `refseq` (1, 2, ... numbering) where the
            first nucleotide in `r1` aligns.
        `refseqend` (int)
            The nucleotide in `refseq` (1, 2, ... numbering) where the
            first nucleotide in `r2` aligns (note that `r2` then reads
            backwards towards the 5' end of `refseq`).
        `maxmuts` (int or float)
            Maximum number of mutations of character `chartype` that
            are allowed in the aligned subamplicons from the two reads.
        `maxN` (int or float)
            Maximum number of nucleotides for which we allow
            ambiguous (``N``) identities in final subamplicon.
        `chartype` (str)
            Character type for which we count mutations.
            Currently, the only allowable value is 'codon'.
        `use_cutils` (bool)
            Use the faster implementation in the `_cutils` module.

    Returns:
        If reads align, return aligned subamplicon as string (of length
        `refseqend - refseqstart + 1`). Otherwise return `None`.

    >>> refseq = 'ATGGGGAAA'
    >>> s = alignSubamplicon(refseq, 'GGGGAA', 'TTTCCC', 3, 9, 1, 1, 'codon')
    >>> s == 'GGGGAAA' 
    True
    >>> s = alignSubamplicon(refseq, 'GGGGAA', 'TTTCCC', 1, 9, 1, 1, 'codon')
    >>> s == False
    True
    >>> s = alignSubamplicon(refseq, 'GGGGAT', 'TTTCCC', 3, 9, 1, 0, 'codon')
    >>> s == False
    True
    >>> s = alignSubamplicon(refseq, 'GGGGAT', 'TTTCCC', 3, 9, 1, 1, 'codon')
    >>> s == 'GGGGANA'
    True
    >>> s = alignSubamplicon(refseq, 'GGGGAT', 'TATCCC', 3, 9, 1, 0, 'codon')
    >>> s == 'GGGGATA'
    True
    >>> s = alignSubamplicon(refseq, 'GGGGAT', 'TATCCC', 3, 9, 0, 0, 'codon')
    >>> s == False
    True
    >>> s = alignSubamplicon(refseq, 'GGGNAA', 'TTTCCC', 3, 9, 0, 0, 'codon')
    >>> s == 'GGGGAAA'
    True
    >>> s = alignSubamplicon(refseq, 'GGGNAA', 'TTNCCC', 3, 9, 0, 0, 'codon')
    >>> s == 'GGGGAAA'
    True
    >>> s = alignSubamplicon(refseq, 'GTTTAA', 'TTTAAA', 3, 9, 1, 0, 'codon')
    >>> s == 'GTTTAAA' 
    True
    >>> s = alignSubamplicon(refseq, 'GGGGTA', 'TTACCC', 3, 9, 1, 0, 'codon')
    >>> s == 'GGGGTAA' 
    True
    >>> s = alignSubamplicon(refseq, 'GGGCTA', 'TTAGCC', 3, 9, 1, 0, 'codon')
    >>> s == False 
    True
    """
    r2 = reverseComplement(r2)

    if use_cutils:
        return dms_tools2._cutils.alignSubamplicon(refseq, r1, r2, 
                refseqstart, refseqend, maxmuts, maxN, chartype)

    assert chartype in ['codon'], "Invalid chartype"
    if chartype == 'codon':
        assert len(refseq) % 3 == 0, "refseq length not divisible by 3"

    len_subamplicon = refseqend - refseqstart + 1
    len_r1 = len(r1)
    len_subamplicon_minus_len_r2 = len_subamplicon - len(r2)
    subamplicon = []
    for i in range(len_subamplicon):
        if i < len_subamplicon_minus_len_r2: # site not in r2
            if i < len_r1: # site in r1
                subamplicon.append(r1[i])
            else: # site not in r1
                subamplicon.append('N')
        else: # site in r2
            if i < len_r1: # site in r1
                r1i = r1[i]
                r2i = r2[i - len_subamplicon_minus_len_r2]
                if r1i == r2i:
                    subamplicon.append(r1i)
                elif r1i == 'N':
                    subamplicon.append(r2i)
                elif r2i == 'N':
                    subamplicon.append(r1i)
                else:
                    subamplicon.append('N')
            else: # site not in r1
                subamplicon.append(r2[i - len_subamplicon_minus_len_r2])
    subamplicon = ''.join(subamplicon)

    if subamplicon.count('N') > maxN:
        return False

    if chartype == 'codon':
        if refseqstart % 3 == 1:
            startcodon = (refseqstart + 2) // 3
            codonshift = 0
        elif refseqstart % 3 == 2:
            startcodon = (refseqstart + 1) // 3 + 1
            codonshift = 2
        elif refseqstart % 3 == 0:
            startcodon = refseqstart // 3 + 1
            codonshift = 1
        nmuts = 0
        for icodon in range(startcodon, refseqend // 3 + 1):
            mutcodon = subamplicon[3 * (icodon - startcodon) + codonshift : 
                    3 * (icodon - startcodon) + 3 + codonshift]
            if ('N' not in mutcodon) and (mutcodon != 
                    refseq[3 * icodon - 3 : 3 * icodon]):
                nmuts += 1
                if nmuts > maxmuts:
                    return False
    else:
        raise ValueError("Invalid chartype")

    return subamplicon


[docs]def incrementCounts(refseqstart, subamplicon, chartype, counts):
    """Increment counts dict based on an aligned subamplicon.

    This is designed for keeping track of counts of different
    mutations / identities when aligning many subamplicons to
    a sequence.

    Any positions where `subamplicon` has an ``N`` are ignored,
    and not added to `counts`.

    Args:
        `refseqstart` (int)
            First nucleotide position in 1, 2, ... numbering 
            where `subamplicon` aligns.
        `subamplicon` (str)
            The subamplicon.
        `chartype` (str)
            Character type for which we are counting mutations.
            Currently, only allowable value is 'codon'.
        `counts` (dict)
            Stores counts of identities, and is incremented by
            this function. Is a dict keyed by every possible
            character (e.g., codon), with values lists with
            element `i` holding the counts for position `i`
            in 0, 1, ... numbering.

    Returns:
        On completion, `counts` has been incremented.

    >>> codonlen = 10
    >>> counts = dict([(codon, [0] * codonlen) for codon 
    ...         in CODONS])
    >>> subamplicon1 = 'ATGGACTTTC'
    >>> incrementCounts(1, subamplicon1, 'codon', counts)
    >>> subamplicon2 = 'GGTCTTTCCCGGN'
    >>> incrementCounts(3, subamplicon2, 'codon', counts)
    >>> counts['ATG'][0] == 1
    True
    >>> counts['GAC'][1] == 1
    True
    >>> counts['GTC'][1] == 1
    True
    >>> counts['TTT'][2] == 2
    True
    >>> counts['CCC'][3] == 1
    True
    >>> sum([sum(c) for c in counts.values()]) == 6
    True
    """
    if chartype == 'codon':
        if refseqstart % 3 == 1:
            startcodon = (refseqstart + 2) // 3 - 1
            codonshift = 0
        elif refseqstart % 3 == 2:
            startcodon = (refseqstart + 1) // 3 
            codonshift = 2
        elif refseqstart % 3 == 0:
            startcodon = refseqstart // 3 
            codonshift = 1
    else:
        raise ValueError("Invalid chartype")

    shiftedsubamplicon = subamplicon[codonshift : ]
    for i in range(len(shiftedsubamplicon) // 3):
        codon = shiftedsubamplicon[3 * i : 3 * i + 3]
        if 'N' not in codon:
            counts[codon][startcodon + i] += 1


[docs]def codonToAACounts(counts):
    """Makes amino-acid counts `pandas.DataFrame` from codon counts.

    Args:
        `counts` (`pandas.DataFrame`)
            Columns are the string `site` `wildtype` and all codons
            in `CODONS`. Additional columns are allowed
            but ignored.

    Returns:
        `aacounts` (`pandas.DataFrame`)
            Columns are the string `site` and all amino acids
            in `AAS_WITHSTOP` with counts for each
            amino acid made by summing counts for encoding codons.

    >>> d = {'site':[1, 2], 'othercol':[0, 0], 'ATG':[105, 1],
    ...         'GGG':[3, 117], 'GGA':[2, 20], 'TGA':[0, 1],
    ...         'wildtype':['ATG', 'GGG']}
    >>> for codon in CODONS:
    ...     if codon not in d:
    ...         d[codon] = [0, 0]
    >>> counts = pandas.DataFrame(d)
    >>> aacounts = codonToAACounts(counts)
    >>> 'othercol' in aacounts.columns
    False
    >>> all(aacounts['site'] == [1, 2])
    True
    >>> all(aacounts['wildtype'] == ['M', 'G'])
    True
    >>> all(aacounts['M'] == [105, 1])
    True
    >>> all(aacounts['G'] == [5, 137])
    True
    >>> all(aacounts['*'] == [0, 1])
    True
    >>> all(aacounts['V'] == [0, 0])
    True
    """
    d = dict([(key, []) for key in ['site', 'wildtype'] + 
            AAS_WITHSTOP])
    for (i, row) in counts.iterrows():
        d['site'].append(row['site'])
        d['wildtype'].append(CODON_TO_AA[row['wildtype']])
        for aa in AAS_WITHSTOP:
            d[aa].append(0)
        for c in CODONS:
            d[CODON_TO_AA[c]][-1] += (row[c])
    return pandas.DataFrame(d)


[docs]def annotateCodonCounts(counts):
    """Gets annotated `pandas.DataFrame` from codon counts.

    Some of the programs (e.g., `dms2_bcsubamplicons`) create 
    ``*_codoncounts.csv`` files when run with ``--chartype codon``.
    These CSV files have columns indicating the `site` and `wildtype`
    codon, as well as a column for each codon giving the counts for that 
    codon. This function reads that file (or a `pandas.DataFrame` read
    from it) to return a `pandas.DataFrame` where a variety of additional
    useful annotations have been added.

    Args:
        `counts` (str)
            Name of existing codon counts CSV file, or `pandas.DataFrame`
            holding counts.

    Returns:
        `df` (`pandas.DataFrame`)
            The DataFrame with the information in `counts` plus
            the following added columns for each site:

                `ncounts` : number of counts at site

                `mutfreq` : mutation frequency at site

                `nstop` : number of stop-codon mutations

                `nsyn` : number of synonymous mutations

                `nnonsyn` : number of nonsynonymous mutations

                `n1nt` : number of 1-nucleotide codon mutations

                `n2nt` : number of 2-nucleotide codon mutations

                `n3nt` : number of 3-nucleotide codon mutations

                `AtoC`, `AtoG`, etc : number of each nucleotide mutation
                type among codon mutations with **one** nucleotide change.

                `mutfreq1nt`, `mutfreq2nt`, `mutfreq3nt` : frequency
                of 1-, 2-, and 3-nucleotide codon mutations at site.

    >>> d = {'site':[1, 2], 'wildtype':['ATG', 'GGG'], 'ATG':[105, 1],
    ...         'GGG':[3, 117], 'GGA':[2, 20], 'TGA':[0, 1]}
    >>> for codon in CODONS:
    ...     if codon not in d:
    ...         d[codon] = [0, 0]
    >>> counts = pandas.DataFrame(d)
    >>> with tempfile.NamedTemporaryFile(mode='w') as f:
    ...     counts.to_csv(f, index=False)
    ...     f.flush()
    ...     df = annotateCodonCounts(f.name)
    >>> all([all(df[col] == counts[col]) for col in counts.columns])
    True
    >>> all(df['ncounts'] == [110, 139])
    True
    >>> all(df['mutfreq'] == [5 / 110., 22 / 139.])
    True
    >>> all(df['nstop'] == [0, 1])
    True
    >>> all(df['nsyn'] == [0, 20])
    True
    >>> all(df['nnonsyn'] == [5, 1])
    True
    >>> all(df['n1nt'] == [0, 20])
    True
    >>> all(df['n2nt'] == [3, 2])
    True
    >>> all(df['n3nt'] == [2, 0])
    True
    >>> all(df['GtoA'] == [0, 20])
    True
    >>> all(df['AtoC'] == [0, 0])
    True
    >>> all(df['mutfreq1nt'] == [0, 20 / 139.])
    True
    >>> all(df['mutfreq3nt'] == [2 / 110., 0])
    True
    """
    if isinstance(counts, str):
        df = pandas.read_csv(counts)
    elif isinstance(counts, pandas.DataFrame):
        df = counts.copy()
    else:
        raise ValueError("invalid counts")
    assert set(CODONS) <= set(df.columns), \
            "Did not find counts for all codons".format(counts)

    df['ncounts'] = df[CODONS].sum(axis=1)

    df['mutfreq'] = (((df['ncounts'] - df.lookup(df['wildtype'].index,
            df['wildtype'].values)) / df['ncounts'].astype('float'))
            .fillna(0))

    ntchanges = ['{0}to{1}'.format(nt1, nt2) for nt1 in dms_tools2.NTS
            for nt2 in dms_tools2.NTS if nt1 != nt2]

    nstoplist = []
    nsynlist = []
    nnonsynlist = []
    nXntlists = dict([(n + 1, []) for n in range(3)])
    nntchangeslists = dict([(ntchange, []) for ntchange in ntchanges])
    for (i, row) in df.iterrows():
        nstop = nsyn = nnonsyn = 0
        nXnt = dict([(n + 1, 0) for n in range(3)])
        nntchanges = dict([(ntchange, 0) for ntchange in ntchanges])
        wt = row['wildtype']
        wtaa = CODON_TO_AA[wt]
        for c in CODONS:
            if c == wt:
                continue
            aa = CODON_TO_AA[c]
            if aa == '*':
                nstop += row[c]
            elif aa == wtaa:
                nsyn += row[c]
            else:
                nnonsyn += row[c]
            ntdiffs = ['{0}to{1}'.format(nt1, nt2) for (nt1, nt2) 
                    in zip(wt, c) if nt1 != nt2]
            nXnt[len(ntdiffs)] += row[c]
            if len(ntdiffs) == 1:
                nntchanges[ntdiffs[0]] += row[c]
        nstoplist.append(nstop)
        nsynlist.append(nsyn)
        nnonsynlist.append(nnonsyn)
        for n in range(3):
            nXntlists[n + 1].append(nXnt[n + 1])
        for ntchange in ntchanges:
            nntchangeslists[ntchange].append(nntchanges[ntchange])
    df = df.assign(nstop=nstoplist, nsyn=nsynlist, nnonsyn=nnonsynlist)
    df = df.assign(n1nt=nXntlists[1], n2nt=nXntlists[2], n3nt=nXntlists[3])
    for ntchange in ntchanges:
        df[ntchange] = nntchangeslists[ntchange]

    for nnt in range(3):
        df['mutfreq{0}nt'.format(nnt + 1)] = (df['n{0}nt'.format(nnt + 1)]
            / df['ncounts'].astype('float')).fillna(0)

    return df


[docs]def adjustErrorCounts(errcounts, counts, charlist, maxexcess):
    """Adjust error counts to not greatly exceed counts of interest.

    This function is useful when estimating preferences. Under the
    model, the error-control should not have a higher rate of error
    than the actual sample. However, this could happen if the experimental
    data don't fully meet the assumptions. So this function scales
    down the error counts in that case.

    Args:
        `errcounts` (pandas.DataFrame)
            Holds counts for error control.
        `counts` (pandas.DataFrame)
            Holds counts for which we are correcting errors.
        `charlist` (list)
            Characters for which we have counts.
        `maxexcess` (int)
            Only let error-control counts exceed actual by this much.

    Returns:
        A copy of `errcounts` except for any non-wildtype character,
        the maximum frequency of that character is adjusted to be
        at most the number predicted by the frequency in `counts`
        plus `maxexcess`.

    >>> counts = pandas.DataFrame({'site':[1], 'wildtype':['A'], 
    ...         'A':500, 'C':10, 'G':40, 'T':20})
    >>> errcounts = pandas.DataFrame({'site':[1], 'wildtype':['A'],
    ...         'A':250, 'C':1, 'G':30, 'T':10})
    >>> charlist = ['A', 'C', 'G', 'T']
    >>> errcounts = errcounts[['site', 'wildtype'] + charlist]
    >>> adj_errcounts = adjustErrorCounts(errcounts, counts, charlist, 1)
    >>> set(adj_errcounts.columns) == set(errcounts.columns)
    True
    >>> all(adj_errcounts['site'] == errcounts['site'])
    True
    >>> all(adj_errcounts['wildtype'] == errcounts['wildtype'])
    True
    >>> (adj_errcounts[adj_errcounts['site'] == 1][charlist].values[0]
    ...         == numpy.array([250, 1, 21, 10])).all()
    True
    """
    cols = counts.columns
    counts = counts.sort_values('site')
    errcounts = errcounts.sort_values('site')
    assert all(counts['site'] == errcounts['site'])
    assert all(counts['wildtype'] == errcounts['wildtype'])
    counts['total'] = counts[charlist].sum(axis=1).astype('float')
    errcounts['total'] = errcounts[charlist].sum(axis=1)
    maxallowed = (counts[charlist].div(counts['total'], axis=0).multiply(
            errcounts['total'], axis=0) + maxexcess).round().astype('int')
    adj_errcounts = errcounts[charlist].where(errcounts[charlist] < maxallowed, 
            maxallowed[charlist])
    for c in charlist:
        adj_errcounts[c] = adj_errcounts[c].where(counts['wildtype'] != c,
                errcounts[c])
    for col in cols:
        if col not in charlist:
            adj_errcounts[col] = counts[col]
    return adj_errcounts[cols]


[docs]def convertCountsFormat(oldfile, newfile, charlist):
    """Convert counts file from ``dms_tools`` to ``dms_tools2`` format.

    Args:
        `oldfile` (str)
            Name of counts file in the old ``dms_tools`` format:
            http://jbloomlab.github.io/dms_tools/fileformats.html
        `newfile` (str)
            Name of created counts file in the ``dms_tools2`` format:
            https://jbloomlab.github.io/dms_tools2/dms2_bcsubamp.html
        `charlist` (list)
            List of characters that we expect in the counts files.
            For instance, could be `CODONS`.
    """
    with open(oldfile) as f:
        header = f.readline()
    assert header[0] == '#'
    cols = header[1 : ].split()
    assert cols[0] == 'POSITION' and cols[1] == 'WT'
    cols = ['site', 'wildtype'] + cols[2 : ]
    assert set(charlist) == set(cols[2 : ])
    old = pandas.read_csv(oldfile, delim_whitespace=True, 
            names=cols, comment='#')
    old.to_csv(newfile, index=False)


[docs]def renumberSites(renumbfile, infiles, missing='error',
        outfiles=None, outprefix=None, outdir=None):
    """Renumber sites in CSV files.

    Switch numbering scheme in files with a column named `site`. 

    You must specify **exactly one** of `outfiles`,
    `outprefix`, and `outdir` as something other than `None`.

    Args:
        `renumbfile` (str)
            Name of existing CSV file with the re-numbering scheme.
            Should have columns with name `original` and `new`.
            Each entry in `original` should refer to a site in 
            the input files, and each entry in `new` should be
            the new number for this site. If an entry in `new`
            is `None` or `nan` then it is dropped from the newly
            numbered files regardless of `missing`.
        `infiles` (list)
            List of existing CSV files that we are re-numbering.
            Each file must have an entry of `site`.
        `missing` (str)
            How to handle sites in `infiles` but not `renumbfile`.
                - `error`: raise an error
                - `skip`: skip renumbering, leave with original number 
                - `drop`: drop any sites not in `renumbfile`
        `outfiles` (list)
            List of output files of the same length as `infiles`.
            The numbered version of `infiles` is named as the
            corresponding entry in `outfiles`.
        `outdir` (str)
            A directory name. The renumbered files have the same
            names as in `infile`, but are now placed in `outdir`.
        `outprefix` (str)
            The renumbered files have the same names and locations
            as `infiles`, but have the pre-pended filename extension
            `outprefix`.
    """
    assert os.path.isfile(renumbfile), "no renumbfile {0}".format(renumbfile)
    renumb = pandas.read_csv(renumbfile)
    assert {'original', 'new'} <=  set(renumb.columns), \
            "renumbfile lacks columns `original` and/or `new`"
    for col in ['original', 'new']:
        assert len(renumb[col]) == len(set(renumb[col])), \
                "duplicate sites for {0} in {1}".format(col, renumbfile)
        renumb[col] = renumb[col].astype('str')

    assert isinstance(infiles, list), "infiles is not a list"
    nin = len(infiles)
    infiles = [os.path.abspath(f) for f in infiles]
    assert len(set(infiles)) == nin, "duplicate files in `infiles`"

    if outfiles is not None:
        assert isinstance(outfiles, list), "`outfiles` not list"
        assert (outdir is None) and (outprefix is None), \
                "only specify one of `outfiles`, `outdir`, and `outprefix`"
        nout = len(outfiles)
        assert nout == nin, "`outfiles` and `infiles` different length"

    elif outdir is not None:
        assert isinstance(outdir, str), "`outdir` should be string"
        assert (outfiles is None) and (outprefix is None), \
                "only specify one of `outfiles`, `outdir`, and `outprefix`"
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        outfiles = [os.path.join(outdir, os.path.basename(f))
                for f in infiles]

    elif outprefix is not None:
        assert isinstance(outprefix, str), "`outdir` should be string"
        assert (outfiles is None) and (outdir is None), \
                "only specify one of `outfiles`, `outdir`, and `outprefix`"
        outfiles = [os.path.join(os.path.dirname(f), outprefix + 
                os.path.basename(f)) for f in infiles]

    else:
        raise ValueError("specify `outdir`, `outprefix`, `outfiles`")

    outfiles = [os.path.abspath(f) for f in outfiles]
    assert len(set(outfiles)) == len(outfiles), "duplicate files in `outfiles`"
    assert not set(outfiles).intersection(set(infiles)), \
            "some in and outfiles the same"

    for (fin, fout) in zip(infiles, outfiles):
        df_in = pandas.read_csv(fin)
        assert 'site' in df_in.columns, "no `site` column in {0}".format(fin)
        df_in['site'] = df_in['site'].astype('str')
        if missing == 'error':
            if set(df_in['site']) > set(renumb['original']):
                raise ValueError("`missing` is `error`, excess sites in {0}"
                    .format(fin))
        elif missing == 'skip':
            pass
        elif missing == 'drop':
            df_in = df_in[df_in['site'].isin(renumb['original'])]
        else:
            raise ValueError("invalid `missing` of {0}".format(missing))

        # can't just use replace below because of this bug:
        # https://github.com/pandas-dev/pandas/issues/16051
        unmappedsites = df_in[~df_in['site'].isin(renumb['original'])]['site']
        replacemap = dict(zip(
                renumb['original'].append(unmappedsites),
                renumb['new'].append(unmappedsites)))
        df_in['site'] = df_in['site'].map(replacemap)

        df_in = (df_in[df_in['site'].notnull()]
                      .query('site != "NaN"')
                      .query('site != "nan"')
                      .query('site != "None"')
                      )

        df_in.to_csv(fout, index=False)


[docs]def codonEvolAccessibility(seqs):
    """Accessibility of amino acids by nucleotide mutations.

    Args:
        `seqs` (str or list)
            A single coding sequence or a list of such sequences.

    Returns:
        A pandas DataFrame listing all sites in the sequence(s)
        numbered 1, 2, ..., with columns giving the accessibility
        of each amino acid by single nucleotide mutations.

    The accessibility of codon :math:`c` to amino-acid :math:`a`
    by single-nucleotide mutations is defined as the minimum
    number of nucleotide mutations needed to generate that
    amino-acid. 

    For a collection of sequences, we calculate the 
    accessibility as the weighted average of the accessibilities
    of all codons observed at that site in the collection of
    sequences.

    As an example, compute accessibility for one sequence:

    >>> s = "ATGGGA"
    >>> acc = codonEvolAccessibility(s)

    The returned pandas DataFrame `acc` is has a column named
    `site` plus columns for all amino acids:

    >>> all(acc.columns == ['site'] + AAS_WITHSTOP)
    True

    We look at entries for a few amino acids. At the first 
    site, the wildtype entry in the sequence `s` is the codon
    for *M* (``ATG``). So at this site, the distance to *M*
    is 0. The distance to *I* (which has codon ``ATA`` as a
    codon) is 1, and the distance to *W* (which has only ``TGG``
    as a codon) is 2.

    >>> acc[['site', 'G', 'I', 'M', 'W']]
       site    G    I    M    W
    0     1  2.0  1.0  0.0  2.0
    1     2  0.0  2.0  3.0  2.0

    If we pass the function a list of multiple sequences,
    then the accessibilities are averaged over the sequences:

    >>> acc2 = codonEvolAccessibility(['ATGGGA', 'ATAGGA'])
    >>> acc2[['site', 'G', 'I', 'M', 'W']]
       site    G    I    M    W
    0     1  2.0  0.5  0.5  2.5
    1     2  0.0  2.0  3.0  2.0
    """
    # get number of nucleotide diffs between all pairs of codons
    nt_diffs = dict([
            ((c1, c2), sum(1 for x1, x2 in zip(c1, c2) if x1 != x2))
            for c1, c2 in itertools.product(CODONS, repeat=2)])

    # get number of nucleotide diffs to nearest codon for amino acid
    aa_nt_diffs = {}
    for c in CODONS:
        for aa, othercs in AA_TO_CODONS.items():
            aa_nt_diffs[(c, aa)] = min([nt_diffs[(c, c2)] 
                    for c2 in othercs])

    # make sure seqs are of same valid length
    if isinstance(seqs, str):
        seqs = [seqs]
    assert len(seqs[0]) % 3 == 0, "seqs not of length divisible by 3"
    assert all([len(seqs[0]) == len(s) for s in seqs[1 : ]]), \
            "seqs not all of same length"

    # get nucleotide distances, summing for all sequences
    dists = collections.defaultdict(lambda: collections.defaultdict(float))
    for s in seqs:
        for r in range(len(s) // 3):
            c = s[3 * r : 3 * r + 3]
            assert c in CODONS, "invalid codon {0}".format(c)
            for aa in AAS_WITHSTOP:
                dists[r + 1][aa] += aa_nt_diffs[(c, aa)]

    return (pandas.DataFrame.from_dict(dists, orient='index')
            .rename_axis('site')
            [AAS_WITHSTOP]
            / len(seqs)).reset_index()


[docs]def sigFigStr(x, nsig):
    """Get str of `x` with `nsig` significant figures.

    >>> sigFigStr(11190, 2)
    '11000'
    >>> sigFigStr(117, 2)
    '120'
    >>> sigFigStr(6, 2)
    '6.0'
    >>> sigFigStr(0.213, 2)
    '0.21'
    >>> sigFigStr(0.007517, 3)
    '0.00752'
    """
    if x <= 0:
        raise ValueError('currently only handles numbers > 0')
    x = float(f"{{:.{nsig}g}}".format(x))
    if x >= 10**(nsig - 1):
        return '{:d}'.format(round(x))
    else:
        predecimal = math.floor(math.log10(x)) + 1
        postdecimal = nsig - predecimal
        assert postdecimal > 0, str(x)
        return f"{{:.{postdecimal}f}}".format(x)

[docs]def getSubstitutions(wildtype, mutant, amino_acid=False):
    """Get space delimited string of substitutions

    Args:
        `wildtype` (str):
             The wildtype sequence
        `mutant` (str):
             The mutant sequence
        `amino_acid` (bool)
             Specify whether the sequence is amino acid.
             Default is False
    Returns:
        A space delimited string of substitutions present in the
        mutant sequence

    >>> getSubstitutions('AGT', 'TGT')
    'A1T'
    >>> getSubstitutions('AAGTAACGA', 'ATCTAACGA')
    'A2T G3C'
    >>> getSubstitutions('TYARV', 'GYAGV', amino_acid=True)
    'T1G R4G'
    """
    if len(wildtype) != len(mutant):
        raise ValueError('wildtype and mutant must be same length')
    subs = []
    for site in range(len(wildtype)):
        wt = wildtype[site]
        mut = mutant[site]
        if amino_acid:
            if wt not in AAS_WITHSTOP:
                raise ValueError (f"Invalid wt residue {wt} at site {site+1}")
            if mut not in AAS_WITHSTOP:
                raise ValueError (f"Invalid mutant residue {mut} at site {site+1}")
        else:
            if wt not in NTS:
                raise ValueError (f"Invalid wt nucleotide {wt} at site {site+1}")
            if mut not in NTS:
                raise ValueError (f"Invalid mutant nucleotide {mut} at site {site+1}")
        if wt!=mut:
            pos = str(site + 1)
            subs.append(f"{wt}{pos}{mut}")
    subs = ' '.join(subs)

    return subs


[docs]def codon_to_nt_counts(codoncounts):
    """Convert codon counts file to nucleotide counts.

    Args:
        `codoncounts` (str or pandas.DataFrame)
            Codon counts in format produced by ``dms2_bcsubamp``,
            either as CSV file or data frame holding CSV.

    Returns:
        pandas.DataFrame with nucleotide counts.

    Example:

    >>> with tempfile.NamedTemporaryFile('w') as f:
    ...     _ = f.write(textwrap.dedent('''
    ...             site,wildtype,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,AGG,AGT,ATA,ATC,ATG,ATT,CAA,CAC,CAG,CAT,CCA,CCC,CCG,CCT,CGA,CGC,CGG,CGT,CTA,CTC,CTG,CTT,GAA,GAC,GAG,GAT,GCA,GCC,GCG,GCT,GGA,GGC,GGG,GGT,GTA,GTC,GTG,GTT,TAA,TAC,TAG,TAT,TCA,TCC,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
    ...             1,ATG,0,0,0,0,0,0,2,0,0,0,0,0,8,0,333985,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    ...             2,AAG,16,20,333132,41,13,12,27,14,8,6,67,8,9,13,29,9,10,11,12,8,10,15,15,11,6,9,3,7,8,10,17,4,3,7,49,7,9,14,9,4,10,7,7,7,9,11,11,5,14,14,11,6,13,16,15,14,9,9,15,8,9,11,8,15
    ...             3,GCA,2,3,8,3,34,11,7,6,7,6,9,8,4,3,5,0,6,14,10,12,6,8,7,10,5,11,7,6,6,1,3,12,19,6,11,9,333250,10,6,9,15,3,5,5,37,9,9,7,8,4,8,3,23,5,7,8,6,11,7,10,7,9,3,6 
    ...             '''.strip()))
    ...     f.flush()
    ...     nt_counts = codon_to_nt_counts(f.name)
    >>> nt_counts
       site wildtype       A       C       G       T
    0     1        A  334009       0       6       0
    1     2        T       0       2       0  334013
    2     3        G       8       0  333993      14
    3     4        A  333424     156     169     187
    4     5        A  333361     211     186     178
    5     6        G     156     185  333427     168
    6     7        G     116     124  333410     125
    7     8        C     126  333407     121     121
    8     9        A  333435     114     112     114

    """
    if not isinstance(codoncounts, pandas.DataFrame):
        codoncounts = pandas.read_csv(codoncounts)

    if codoncounts['site'].dtype != int:
        raise ValueError('`site` column in `codoncounts` must be integer')

    nt_counts = []
    for i_nt in [0, 1, 2]:
        nt_counts.append(
                codoncounts
                .melt(id_vars=['site', 'wildtype'],
                      var_name='codon',
                      value_name='count',
                      )
                .assign(
                    site=lambda x: 3 * (x['site'] - 1) + i_nt + 1,
                    wildtype=lambda x: x['wildtype'].str[i_nt],
                    nucleotide=lambda x: x['codon'].str[i_nt],
                    )
                .groupby(['site', 'wildtype', 'nucleotide'])
                .aggregate({'count': 'sum'})
                .reset_index()
                .pivot_table(values='count',
                             columns='nucleotide',
                             index=['site', 'wildtype'])
                .reset_index()
                )

    nt_counts = (pandas.concat(nt_counts)
                 .sort_values('site')
                 .reset_index(drop=True)
                 )
    del nt_counts.columns.name
    return nt_counts


[docs]def barcodeInfoToCodonVariantTable(samples, geneseq, path=None):
    """Convert barcode info files into a CodonVariantTable

    Convert barcode info files output from `dms2_bcsubamp` into a
    `CodonVariantTable`. Barcode info files contain reads and barcodes from
    barcoded subamplicon sequencing, described 
    `here <https://jbloomlab.github.io/dms_tools2/bcsubamp.html>`_.
    This function takes consensus reads retained by `dms2_bcsubamp`, 
    gives each unique sequence a numerical barcode (since the barcodes from 
    `dms2_bcsubamp` could come from the same variant), and counts the number
    of retained consensus reads corresponding to each sequence. Then, a
    `CodonVariantTable` is made using the sequences and their numerical 
    barcodes, and counts are added based on the number of retained consensus
    reads of those sequences. Therefore, the `CodonVariantTable` will only 
    contain one 'variant' for each unique sequence with the total count for all
    the unbarcoded variants in the experiment which had the same sequence.

    Args:
        `samples` (dict):
            Dictionary with libraries as keys and lists of info file prefixes
            (file names without the '_bcinfo.txt.gz') for files corresponding
            to those libraries as values.
            
            Example: {'library-1':['condition-1-library-1'],
                      'library-2':['condition-1-library-2']}
        `geneseq` (str):
            The wildtype gene sequence
        `path` (str)
            Directory in which barcode info files are located

    Returns:
        A `dms_variants.codonvarianttable.CodonVariantTable` with 'counts'
        generated from the barcode info files
    """

    # Set up re matchers for looking at lines
    matcher = re.compile(r'(?P<linetype>^.*\:) '
                         r'(?P<contents>.*$)')

    alt_matcher = re.compile(r'(?P<linetype>^R\d READS:$)')

    read_matcher = re.compile(r'(?P<read>^[ATGCN\s]*$)')

    # Create a dictionary to contain dictionaries of each library's barcodes
    libraries = {}

    # Initialize lists for making the codonvarianttable
    barcodes = []
    subs = []
    variant_call_support = []
    library_list = []

    # For each library, go through each sample file and collect data
    for library in samples.keys():
        # Initialize dictionary to contain this library's reads and barcodes
        barcode_dictionary = {}
        # Start a barcode count for this library
        cur_barcode = 1
        # For each barcode info file corresponding to a sample in this library
        for sample in samples[library]:

            # Set initial conditions
            take_next = False
            description_skipped = False

            # Find the file
            f = f"{sample}_bcinfo.txt.gz"
            if path:
                file_path = os.path.join(os.path.abspath(path), f)
            else:
                file_path = f

            # Open the file and loop through it to find retained consensus
            # reads and give them each a new barcode
            with gzip.open(file_path, 'r') as f:
                # Make sure the first line looks like it is supposed to
                firstline = f.readline()
                firstline = firstline.decode()
                first_match = matcher.match(firstline)
                if first_match.group('linetype') != 'BARCODE:':
                    raise ValueError(f"Unexpected first line {firstline}: may be "
                    "unexpected file type")
                else:
                    previous_line = first_match

                # Go through the lines, making they are in the expected order
                for line in f:
                    line = line.decode()
                    line_match = matcher.match(line)
                    if not line_match:
                        line_match = alt_matcher.match(line)
                    if not line_match:
                        read_match = read_matcher.match(line)
                        if not read_match:
                            raise ValueError(f"Unable to recognize line {line}")
                        else:
                            line_is_read = True
                            previous_linetype = previous_line.group('linetype')
                            if previous_linetype != 'R1 READS:' and \
                               previous_linetype != 'R2 READS:':
                               raise ValueError(f"Unexpected line {line}")
                    else:
                        line_is_read = False
                    if previous_line.group('linetype') == 'BARCODE:':
                        if line_match.group('linetype') != 'RETAINED:':
                            raise ValueError(f"Unexpected line {line}")
                        # Decide whether to retain the next consensus or not
                        else:
                            if line_match.group('contents') == 'False':
                                retain = False
                            elif line_match.group('contents') == 'True':
                                retain = True
                            else:
                                raise ValueError(f"Unexpected line {line}")
                    elif previous_line.group('linetype') == 'RETAINED:':
                        if line_match.group('linetype') != 'DESCRIPTION:':
                            raise ValueError(f"Unexpected line {line}")
                    elif previous_line.group('linetype') == 'DESCRIPTION:':
                        if line_match.group('linetype') != 'CONSENSUS:':
                            raise ValueError(f"Unexpected line {line}")
                        # Make sure we know whether to retain or not
                        elif not isinstance(retain, bool):
                            raise ValueError(
                            f"Unclear whether to retain {line_match.group('contents')}"
                            )
                        elif retain:
                            read = line_match.group('contents')
                            # Add the read to the dictionary if not in it
                            # Also give it a barcode
                            if 'N' not in read:
                                if read not in barcode_dictionary:
                                    # Create the sequence in the dictionary
                                    barcode_dictionary[read] = {}
                                    # Give it an initial count of 1 for this sample
                                    barcode_dictionary[read][sample] = 1
                                    # Give it the next barcode
                                    barcode_dictionary[read]['barcode'] = cur_barcode
                                    # Save values for making CodonVariantTable
                                    barcodes.append(cur_barcode)
                                    subs.append(getSubstitutions(geneseq, read))
                                    variant_call_support.append(1)
                                    library_list.append(library)
                                    # Advance current barcode
                                    cur_barcode += 1
                                else:
                                    # Add a counter for the sample if sequence
                                    # not seen for this sample yet
                                    if sample not in barcode_dictionary[read]:
                                        barcode_dictionary[read][sample] = 1
                                    else:
                                        # Add another count to this read for
                                        # this sample
                                        barcode_dictionary[read][sample] += 1
                        # Set retain to None
                        retain = None
                    elif previous_line.group('linetype') == 'CONSENSUS:':
                        if line_match.group('linetype') != 'R1 READS:':
                            raise ValueError(f"Unexpected line {line}")
                    elif previous_line.group('linetype') == 'R1 READS:':
                        if not line_is_read:
                            if line_match.group('linetype') != 'R2 READS:':
                                raise ValueError(f"Unexpected line {line}")
                    elif previous_line.group('linetype') == 'R2 READS:':
                        if not line_is_read:
                            if line_match.group('linetype') != 'BARCODE:':
                                raise ValueError(f"Unexpected line {line}")
                    # Save this line as the previous line if it is not a read
                    if not line_is_read:
                        previous_line = line_match
        # After going through each file for a library, save its dictionary with
        # reads and barcodes
        libraries[library] = barcode_dictionary

    # Make the dataframe for creating the codonvarianttable
    df =  {'barcode':barcodes,
            'substitutions':subs,
            'library':library_list,
            'variant_call_support':variant_call_support,
           }
    df = pandas.DataFrame(df)

    # Make the codonvarianttable
    with tempfile.NamedTemporaryFile(mode='w') as f:
        df.to_csv(f, index=False)
        f.flush()
        variants = dms_variants.codonvarianttable.CodonVariantTable(
                    barcode_variant_file=f.name,
                    geneseq=geneseq)

    # Make the counts dataframe:
    # Initialize list of dataframes
    dfs = []

    # Loop through each library and produce count dataframes for each sample
    for library in libraries:
        barcode_dictionary = libraries[library]
        for sample in samples[library]:
            barcodes_list = []
            counts_list = []
            sample_list = []
            library_list = []
            # Get counts for this sample
            for sequence in barcode_dictionary.keys():
                if sample not in barcode_dictionary[sequence].keys():
                    counts_list.append(0)
                else:
                    counts_list.append(barcode_dictionary[sequence][sample])
                barcodes_list.append(barcode_dictionary[sequence]['barcode'])
                sample_list.append(sample)
                library_list.append(library)
            # Make a dataframe for this sample
            data = {'barcode':barcodes_list,
                    'count':counts_list,
                    'sample':sample_list,
                    'library':library_list,
                   }
            data = pandas.DataFrame(data)
            # Append it to the list of dataframes
            dfs.append(data)
        # Concatenate the list of dataframes into a counts dataframe
        barcode_counts = pandas.concat(dfs)
    # Add the counts for each sample to the codonvarianttable
    for library in libraries:
        for sample in samples[library]:
            icounts = barcode_counts.query('library == @library & sample == @sample')
            icounts = icounts[['barcode', 'count']]
            variants.addSampleCounts(library, sample, icounts)

    return(variants)

if __name__ == '__main__':
    import doctest
    doctest.testmod()
Source code for dms_tools2.utils

dms_tools2

Navigation

Related Topics