# Source code for dms_tools2.parseargs

"""
===================
parseargs
===================

Argument parsing for the executable scripts in ``dms_tools2``.
"""


import re
import argparse
import dms_tools2


# Any single character that is NOT a letter, digit, hyphen, space, or period.
_ILLEGAL_NAME_CHAR = re.compile(r'[^a-zA-Z0-9\- \.]')


def checkName(name, nametype):
    """Check if `name` is an allowable name.

    Allowed names may contain only ASCII letters, digits, hyphens,
    spaces, and periods. In particular they may **not** contain
    LaTex special characters such as the underscore.

    Args:
        `name` (str)
            Name to check
        `nametype` (str)
            If we print an exception what do we call the parameter
            that failed? For instance, `name` or `group`.

    Returns:
        Returns `True` if `name` is an allowable name.
        Otherwise raises a `ValueError` explaining why the
        `name` is invalid. Each illegal character is reported
        once, in order of first appearance.

    >>> checkName('sample-1', 'name')
    True

    >>> checkName('sample 1', 'name')
    True

    >>> checkName('PGT151 - 5 nM', 'name')
    True

    >>> checkName('sample_1', 'name')
    Traceback (most recent call last):
        ...
    ValueError: name sample_1 contains following illegal characters: _

    >>> checkName('sample_1', 'group')
    Traceback (most recent call last):
        ...
    ValueError: group sample_1 contains following illegal characters: _

    >>> checkName('a_b_c', 'name')
    Traceback (most recent call last):
        ...
    ValueError: name a_b_c contains following illegal characters: _
    """
    if not name or name.isspace():
        raise ValueError("{0} is all whitespace".format(nametype))

    # Collect offending characters once each, keeping first-seen order so
    # the message stays short and deterministic for names that repeat them.
    illegal_chars = []
    for char in _ILLEGAL_NAME_CHAR.findall(name):
        if char not in illegal_chars:
            illegal_chars.append(char)

    if illegal_chars:
        raise ValueError("{0} {1} contains following illegal characters: "
                "{2}".format(nametype, name, ', '.join(illegal_chars)))
    return True

def parentParser():
    """Build the parent parser holding options common to all programs.

    The parser is created with ``add_help=False``, so it defines no
    ``-h / --help`` option of its own.

    Returns:
        `argparse.ArgumentParser` on which these arguments are
        already defined:

            - ``--outdir``

            - ``--ncpus``

            - ``--use_existing``

            - ``-v / --version``
    """
    common = argparse.ArgumentParser(add_help=False)
    add = common.add_argument

    add('--outdir',
        help='Output files to this directory (create if needed).')

    add('--ncpus', default=-1, type=int,
        help='Number of CPUs to use, -1 is all available.')

    add('--use_existing', default='no', choices=['yes', 'no'],
        help='If files with names of expected output already exist, '
             'do not re-run.')

    # The version string is built from the installed package version.
    version_text = '%(prog)s {0}'.format(dms_tools2.__version__)
    add('-v', '--version', action='version', version=version_text)

    return common


def parserDescription(description):
    """Append package information to a program description.

    Args:
        `description` (str)
            Description of program

    Returns:
        A string consisting of `description` followed by a sentence
        giving the `dms_tools2` package name (as a reST link to its
        URL), its version, and its author.
    """
    template = "{0} Part of `{1} <{4}>`_ (version {2}) written by {3}."
    package = dms_tools2
    return template.format(
            description,
            package.__name__,
            package.__version__,
            package.__author__,
            package.__url__)


def bcsubampParentParser():
    """Parent parser for ``dms2_bcsubamp`` / ``dms2_batch_bcsubamp``.

    Starts from `parentParser` and adds the options shared by both
    programs: reference sequence and alignment specification, barcode
    lengths, read locations and trimming, consensus-calling thresholds,
    site masking, random purging of reads / barcodes, and barcode-info
    output.

    Returns:
        `argparse.ArgumentParser` using
        `argparse.ArgumentDefaultsHelpFormatter`, so default values
        appear in the help text.
    """
    parser = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--refseq', required=True,
            help='Align subamplicons to gene in this FASTA file.')

    # Help text previously ended with an unmatched single quote.
    parser.add_argument('--alignspecs', required=True, nargs='+',
            help=("Subamplicon alignment positions as "
            "'REFSEQSTART,REFSEQEND,R1START,R2START'. "
            "REFSEQSTART is nt (1, 2, ... numbering) in "
            "'refseq' where nt R1START in R1 aligns. "
            "REFSEQEND is nt in 'refseq' where nt R2START "
            "in R2 aligns."))

    parser.add_argument('--bclen', type=int, default=8,
            help='Length of NNN... barcode at start of each read. '
                 'Assumed to be same for R1 and R2, use `--bclen2` '
                 'if this is not the case.')

    parser.add_argument('--fastqdir',
            help='R1 and R2 files in this directory.')

    parser.add_argument('--R2', nargs='+', help=("Read 2 (R2) FASTQ "
            "files assumed to have same names as R1 but with "
            "'_R1' replaced by '_R2'. If that is not case, provide "
            "names here."))

    parser.add_argument('--R1trim', type=int, nargs='+',
        help=("Trim R1 from 3' end to this length. One value for all "
        "reads or values for each subamplicon in ``--alignspecs``."))

    parser.add_argument('--R2trim', type=int, nargs='+',
        help="Like '--R1trim', but for R2.")

    parser.add_argument('--bclen2', type=int, help='If R1 and R2 have '
            'different length barcodes, use `--bclen` for R1 length '
            'and `--bclen2` for R2 length.')

    parser.add_argument('--chartype', default='codon', choices=['codon'],
            help='Character type for which we count mutations.')

    parser.add_argument('--maxmuts', type=int, default=4,
            help=("Max allowed mismatches in alignment of subamplicon; "
            "mismatches counted in terms of character '--chartype'."))

    parser.add_argument('--minq', type=int, default=15,
            help="Only call nucleotides with Q score >= this.")

    parser.add_argument('--minreads', type=int, default=2,
            help=("Require this many reads in a barcode to agree to "
            "call consensus nucleotide identity."))

    parser.add_argument('--minfraccall', type=float, default=0.95,
            help=("Retain only barcodes where trimmed consensus "
            "sequence for each read has >= this frac sites called."))

    parser.add_argument('--minconcur', default=0.75, type=float,
            help=('Only call consensus identity for barcode when >= '
            'this fraction of reads concur.'))

    parser.add_argument('--sitemask', help='Use to only consider '
            'mutations at a subset of sites. Should be a CSV file '
            'with column named `site` listing all sites to include.')

    parser.add_argument('--purgeread', type=float, default=0,
            help=("Randomly purge read pairs with this probability "
            "to subsample data."))

    parser.add_argument('--purgebc', type=float, default=0,
            help=("Randomly purge barcodes with this probability to "
            "subsample data."))

    # Both flags are off unless given explicitly on the command line.
    parser.set_defaults(bcinfo=False, bcinfo_csv=False)
    parser.add_argument('--bcinfo', dest='bcinfo', action='store_true',
            help=("Create file with suffix 'bcinfo.txt.gz' with info "
            "about each barcode."))
    parser.add_argument('--bcinfo_csv', dest='bcinfo_csv',
            action='store_true', help=("Store 'bcinfo' file as a csv "
            "with the suffix 'bcinfo.csv.gz'. Only has an effect if "
            "`--bcinfo` is used."))

    return parser


def bcsubampParser():
    """Returns `argparse.ArgumentParser` for ``dms2_bcsubamp``.

    Extends `bcsubampParentParser` with the required per-sample
    options ``--name`` and ``--R1``.
    """
    summary = parserDescription(
            'Align barcoded subamplicons and count mutations.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[bcsubampParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    cli.add_argument(
            '--name',
            required=True,
            help='Sample name used for output files.')

    cli.add_argument(
            '--R1',
            nargs='+',
            required=True,
            help="Read 1 (R1) FASTQ files, can be gzipped. See also "
                 "'--fastqdir'.")

    return cli


def batch_bcsubampParser():
    """Returns `argparse.ArgumentParser` for ``dms2_batch_bcsubamp``.

    Extends `bcsubampParentParser` with the required batch options
    ``--batchfile`` and ``--summaryprefix``.
    """
    summary = parserDescription(
            'Perform many runs of ``dms2_bcsubamp`` and plot results.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[bcsubampParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    batchfile_help = (
            "CSV file specifying each ``dms2_bcsubamp`` run. "
            "Must have these columns: `name`, `R1`. "
            "Can optionally have columns `R1trim` and `R2trim` with spaces "
            "delimiting subamplicon-specific trimming. "
            "If `R1trim` / `R2trim` in batch file, do **not** also give "
            "values for ``--R1trim`` and ``--R2trim``. "
            "Other columns are ignored, so other ``dms2_bcsubamp`` args "
            "should be passed as separate command line args rather than in "
            "``--batchfile``.")
    cli.add_argument('--batchfile', required=True, help=batchfile_help)

    cli.add_argument(
            '--summaryprefix',
            required=True,
            help="Prefix of output summary plots.")

    return cli


def prefsParentParser():
    """Parent parser for ``dms2_prefs`` / ``dms2_batch_prefs``.

    Starts from `parentParser` and adds the options shared by both
    programs: estimation method, input directory, character type,
    stop-codon handling, prior concentration parameters, and the
    pseudocount.

    Returns:
        `argparse.ArgumentParser` using
        `argparse.ArgumentDefaultsHelpFormatter`, so default values
        appear in the help text.
    """
    parser = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--method', default='bayesian',
            choices=['ratio', 'bayesian'], help="Method to "
            "estimate preferences: normalized enrichment ratios "
            "or Bayesian inference.")

    parser.add_argument('--indir', help="Input counts files in this "
            "directory.")

    parser.add_argument('--chartype', default='codon_to_aa',
            choices=['codon_to_aa'], help="Characters for which "
            "preferences are estimated. `codon_to_aa` = amino acids "
            "from codon counts.")

    parser.add_argument('--excludestop', default='yes', choices=['yes', 'no'],
            help="Exclude stop codons as a possible amino acid?")

    parser.add_argument('--conc', nargs=3, default=[1, 1, 1],
            type=float, metavar=('Cprefs', 'Cmut', 'Cerr'),
            help="Concentration parameters for priors for "
            "``--method bayesian``. Priors are over preferences, "
            "mutagenesis rate, and error rate(s).")

    # `type=float` is required: without it a value given on the command
    # line stays a `str` while the default is a number. This also matches
    # the `--pseudocount` option of the diffsel / fracsurvive parsers.
    parser.add_argument('--pseudocount', default=1, type=float,
            help="Pseudocount used with ``--method ratio``.")

    return parser


def prefsParser():
    """Returns `argparse.ArgumentParser` for ``dms2_prefs``.

    Extends `prefsParentParser` with the required ``--pre``,
    ``--post`` and ``--name`` options and the optional ``--err``.
    """
    summary = parserDescription('Estimate preferences from mutation counts.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[prefsParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = cli.add_argument

    add('--pre', required=True,
        help='Pre-selection counts file or prefix used when creating this '
             'file.')

    add('--post', required=True,
        help='Like ``--pre`` but for post-selection counts.')

    add('--name', required=True,
        help='Name used for output files.')

    # Exactly two values: the control for --pre, then the one for --post.
    add('--err', nargs=2, metavar=('ERRPRE', 'ERRPOST'),
        help="Like ``--pre`` but for counts for error control(s) for "
             "``--pre`` and ``--post``. Specify same file twice for same "
             "control for both.")

    return cli


def batch_prefsParser():
    """Returns `argparse.ArgumentParser` for ``dms2_batch_prefs``.

    Extends `prefsParentParser` with the required ``--batchfile`` and
    ``--summaryprefix`` options and the ``--no_corr`` / ``--no_avg``
    flags.
    """
    summary = parserDescription(
            'Perform many runs of ``dms2_prefs`` and summarize results.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[prefsParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = cli.add_argument

    batchfile_help = (
            "CSV file specifying each ``dms2_prefs`` run. "
            "Must have these columns: `name`, `pre`, `post`. "
            "Can also have these columns: `err` or `errpre` and `errpost`. "
            "Other columns are ignored, so other ``dms2_prefs`` args should "
            "be passed as separate command line args rather than in "
            "``--batchfile``.")
    add('--batchfile', required=True, help=batchfile_help)

    add('--summaryprefix', required=True,
        help="Prefix of output summary files and plots.")

    # Both switches are off unless given on the command line.
    cli.set_defaults(no_corr=False, no_avg=False)
    add('--no_corr', action='store_true', dest='no_corr',
        help='Do not create correlation plot.')
    add('--no_avg', action='store_true', dest='no_avg',
        help='Do not create average prefs CSV.')

    return cli


def diffselParentParser():
    """Parent parser for ``dms2_diffsel`` / ``dms2_batch_diffsel``.

    Starts from `parentParser` and adds ``--indir``, ``--chartype``,
    ``--excludestop``, ``--pseudocount`` and ``--mincount``.
    """
    shared = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = shared.add_argument

    add('--indir',
        help="Input counts files in this directory.")

    add('--chartype', choices=['codon_to_aa'], default='codon_to_aa',
        help="Characters for which differential selection is estimated. "
             "`codon_to_aa` = amino acids from codon counts.")

    add('--excludestop', choices=['yes', 'no'], default='yes',
        help="Exclude stop codons as a possible amino acid?")

    add('--pseudocount', type=float, default=5,
        help="Pseudocount added to each count for sample with smaller "
             "depth; pseudocount for other sample scaled by relative "
             "depth.")

    add('--mincount', type=float, default=0,
        help="Report as `NaN` the diffsel of mutations for which both "
             "selected and mock-selected samples have < this many "
             "counts.")

    return shared


def diffselParser():
    """Returns `argparse.ArgumentParser` for ``dms2_diffsel``.

    Extends `diffselParentParser` with the required ``--name``,
    ``--sel`` and ``--mock`` options and the optional ``--err``.
    """
    summary = parserDescription(
            'Estimate differential selection from mutation counts.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[diffselParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = cli.add_argument

    add('--name', required=True,
        help='Name used for output files.')

    add('--sel', required=True,
        help="Post-selection counts file or prefix used when creating "
             "this file.")

    add('--mock', required=True,
        help="Like ``--sel``, but for mock-selection counts.")

    add('--err',
        help="Like ``--sel`` but for error-control to correct mutation "
             "counts.")

    return cli


def batch_diffselParser():
    """Returns `argparse.ArgumentParser` for ``dms2_batch_diffsel``.

    Extends `diffselParentParser` with the required batch options
    ``--batchfile`` and ``--summaryprefix``.
    """
    summary = parserDescription(
            'Perform many runs of ``dms2_diffsel`` and summarize results.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[diffselParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    batchfile_help = (
            'CSV file specifying each ``dms2_diffsel`` run. '
            'Must have these columns: `name`, `sel`, `mock`. '
            'Can also have these: `err`, `group`, `grouplabel`. '
            'If `group` is used, samples are grouped in summary plots '
            'labeled by `group`, or by `grouplabel` if specified. '
            'Other columns are ignored, so other ``dms2_diffsel`` args '
            'should be passed as separate command line args rather than in '
            '``--batchfile``.')
    cli.add_argument('--batchfile', help=batchfile_help, required=True)

    cli.add_argument(
            '--summaryprefix',
            help='Prefix of output summary files and plots.',
            required=True)

    return cli


def fracsurviveParser():
    """Returns `argparse.ArgumentParser` for ``dms2_fracsurvive``.

    Extends `fracsurviveParentParser` with the required ``--name``,
    ``--sel``, ``--mock`` and ``--libfracsurvive`` options and the
    optional ``--err``.
    """
    summary = parserDescription(
            'Estimate fraction surviving for each mutation.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[fracsurviveParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = cli.add_argument

    add('--name', required=True,
        help='Name used for output files.')

    add('--sel', required=True,
        help="Post-selection counts file or prefix used when creating "
             "this file.")

    add('--mock', required=True,
        help="Like ``--sel``, but for mock-selection counts.")

    add('--libfracsurvive', type=float, required=True,
        help='Overall fraction of total library surviving selection versus '
             'mock condition. Should be between 0 and 1.')

    add('--err',
        help="Like ``--sel`` but for error-control to correct mutation "
             "counts.")

    return cli


def batch_fracsurviveParser():
    """Returns `argparse.ArgumentParser` for ``dms2_batch_fracsurvive``.

    Extends `fracsurviveParentParser` with the required batch options
    ``--batchfile`` and ``--summaryprefix``.
    """
    summary = parserDescription(
            'Perform runs of ``dms2_fracsurvive`` and summarize results.')
    cli = argparse.ArgumentParser(
            description=summary,
            parents=[fracsurviveParentParser()],
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    batchfile_help = (
            'CSV file specifying each ``dms2_fracsurvive`` run. '
            'Must have these columns: `name`, `sel`, `mock`, '
            '`libfracsurvive`. '
            'Can also have these `err`, `group`, `grouplabel`. '
            'If `group` is used, samples are grouped in summary plots '
            'labeled by `group`, or by `grouplabel` if it is specified. '
            'Other columns are ignored, so other ``dms2_fracsurvive`` args '
            'should be passed as separate command line args rather than in '
            '``--batchfile``.')
    cli.add_argument('--batchfile', help=batchfile_help, required=True)

    cli.add_argument(
            '--summaryprefix',
            help='Prefix of output summary files and plots.',
            required=True)

    return cli


def fracsurviveParentParser():
    """Parent parser for ``dms2_fracsurvive`` / ``dms2_batch_fracsurvive``.

    Starts from `parentParser` and adds ``--indir``, ``--chartype``,
    ``--aboveavg``, ``--excludestop``, ``--pseudocount`` and
    ``--mincount``.
    """
    shared = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = shared.add_argument

    add('--indir',
        help="Input counts files in this directory.")

    add('--chartype', choices=['codon_to_aa'], default='codon_to_aa',
        help="Characters for which fraction surviving selection is "
             "estimated. `codon_to_aa` = amino acids from codon counts.")

    add('--aboveavg', choices=['yes', 'no'], default='no',
        help="Report fracsurvive **above** the library average rather "
             "than direct fracsurvive values.")

    add('--excludestop', choices=['yes', 'no'], default='yes',
        help="Exclude stop codons as a possible amino acid?")

    add('--pseudocount', type=float, default=5,
        help="Pseudocount added to each count for sample with smaller "
             "depth; pseudocount for other sample scaled by relative "
             "depth.")

    add('--mincount', type=float, default=0,
        help="Report as `NaN` the fracsurvive of mutations for which both "
             "selected and mock-selected samples have < this many "
             "counts.")

    return shared



def logoplotParser():
    """Returns `argparse.ArgumentParser` for ``dms2_logoplot``.

    Starts from `parentParser`. Exactly one of the data options
    (``--prefs``, ``--diffsel``, ``--fracsurvive``, ``--diffprefs``,
    ``--muteffects``) must be given; the remaining options control
    layout, scaling, coloring and overlays of the plot.
    """
    cli = argparse.ArgumentParser(
            description=parserDescription('Create logo plot visualization.'),
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = cli.add_argument

    # Exactly one data source has to be chosen.
    data_sources = (
            ('--prefs', 'CSV file of amino-acid preferences.'),
            ('--diffsel', 'CSV file of amino-acid differential selection.'),
            ('--fracsurvive', 'CSV file of amino-acid fraction surviving.'),
            ('--diffprefs',
             'CSV file of differences in amino-acid preferences.'),
            ('--muteffects', 'CSV file of amino-acid mutational effects.'),
            )
    source_group = cli.add_mutually_exclusive_group(required=True)
    for flag, flag_help in data_sources:
        source_group.add_argument(flag, help=flag_help)

    add('--name', required=True,
        help='Name used for output files.')

    add('--nperline', type=int, default=70,
        help='Number of sites per line.')

    add('--numberevery', type=int, default=10,
        help='Number sites at this interval.')

    add('--excludestop', default='no', choices=['yes', 'no'],
        help='Exclude stop codons as possible amino acid?')

    add('--stringency', type=float, default=1,
        help='Stringency parameter to re-scale prefs.')

    add('--restrictdiffsel', default='all',
        choices=['all', 'positive', 'negative'],
        help='Plot all diffsel, or only positive or negative.')

    add('--diffselrange', type=float, nargs=2,
        metavar=('MINDIFFSEL', 'MAXDIFFSEL'),
        help='Specify a fixed range for `diffsel`. Otherwise determined '
             'from data range.')

    add('--muteffectrange', type=float, nargs=2,
        metavar=('MINMUTEFFECT', 'MAXMUTEFFECT'),
        help='Specify a fixed range for `muteffects`. Otherwise determined '
             'from data range.')

    add('--fracsurvivemax', type=float,
        help='Specify maximum value for `fracsurvive`. Otherwise '
             'determined from data range.')

    add('--sortsites', default='yes', choices=['yes', 'no'],
        help='Sort sites from first to last before plotting.')

    add('--mapmetric', default='functionalgroup',
        choices=['kd', 'mw', 'charge', 'functionalgroup', 'singlecolor'],
        help='Color amino acids by Kyte-Doolittle hydrophobicity, '
             'molecular weight, charge, or functional group.')

    # Shared reST link used in the help of both colormap options.
    colormap_link = ("`matplotlib color map "
                     "<http://matplotlib.org/users/colormaps.html>`_")

    add('--colormap', default='jet',
        help=colormap_link + " for amino acids when `--mapmetric` is 'kd' "
             "or 'mw'; name of single color when it is 'singlecolor'.")

    overlay_metavar = ('FILE', 'SHORTNAME', 'LONGNAME')

    add('--overlay1', nargs=3, metavar=overlay_metavar,
        help="Color bar above logo plot to denote per-residue property. "
             "FILE is CSV format with column names `site` and SHORTNAME. "
             "SHORTNAME is <= 5 character property name. "
             "LONGNAME is longer name for legend. "
             "Sites not in FILE are colored white. "
             "To show wildtype identity, make SHORTNAME and LONGNAME both "
             "`wildtype` and have this column in FILE give 1-letter "
             "wildtype amino-acid code. "
             "To show ``omegabysite.txt`` file from ``phydms``, give that "
             "file and set both SHORTNAME and LONGNAME to `omegabysite`.")

    add('--overlay2', nargs=3, default=None, metavar=overlay_metavar,
        help='Second overlay color bar.')

    add('--overlay3', nargs=3, default=None, metavar=overlay_metavar,
        help='Third overlay color bar.')

    add('--underlay', choices=['yes', 'no'], default='no',
        help='Plot underlay rather than overlay bars.')

    add('--scalebar', nargs=2, default=None,
        metavar=('BARHEIGHT', 'LABEL'),
        help='Plot a scale bar indicating BARHEIGHT with LABEL. Only for '
             '`diffsel`, `fracsurvive`, and `muteffects`.')

    add('--overlaycolormap', default='jet',
        help=colormap_link + " for overlay bars (e.g., 'jet' or 'YlOrRd').")

    add('--letterheight', type=float, default=1,
        help="Relative height of letter stacks in logo plot.")

    add('--ignore_extracols', choices=['yes', 'no'], default='no',
        help='Ignore extra columns in data')

    add('--sepline', choices=['yes', 'no'], default='yes',
        help="Separate positive and negative diffsel with black line?")

    return cli



if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this
    # module's docstrings.
    from doctest import testmod
    testmod()
# Source code for dms_tools2.parseargs

# dms_tools2

# Navigation

# Related Topics