"""
===================
parseargs
===================
Argument parsing for the executable scripts in ``dms_tools2``.
"""
import re
import argparse
import dms_tools2
def checkName(name, nametype):
    """Check if `name` is an allowable name.

    Allowed names may contain only ASCII letters, digits, hyphens,
    spaces, and periods. Everything else (including LaTeX special
    characters such as the underscore) is **not** allowed.

    Args:
        `name` (str)
            Name to check
        `nametype` (str)
            What to call the parameter in the error message if the
            check fails. For instance, `name` or `group`.

    Returns:
        Returns `True` if `name` is an allowable name.
        Otherwise raises a `ValueError` explaining why the
        `name` is invalid. Each illegal character is reported
        once, in order of first appearance.

    >>> checkName('sample-1', 'name')
    True

    >>> checkName('sample 1', 'name')
    True

    >>> checkName('PGT151 - 5 nM', 'name')
    True

    >>> checkName('sample_1', 'name')
    Traceback (most recent call last):
        ...
    ValueError: name sample_1 contains following illegal characters: _

    >>> checkName('sample_1', 'group')
    Traceback (most recent call last):
        ...
    ValueError: group sample_1 contains following illegal characters: _

    >>> checkName('a_b_c', 'name')
    Traceback (most recent call last):
        ...
    ValueError: name a_b_c contains following illegal characters: _
    """
    # An empty string is rejected with the same message as a
    # whitespace-only one.
    if not name or name.isspace():
        raise ValueError("{0} is all whitespace".format(nametype))

    # Collect every disallowed character in a single regex pass, then
    # drop repeats while keeping first-appearance order so the error
    # message lists each offender only once.
    illegal_chars = []
    for char in re.findall(r'[^a-zA-Z0-9\- .]', name):
        if char not in illegal_chars:
            illegal_chars.append(char)

    if illegal_chars:
        raise ValueError("{0} {1} contains following illegal characters: "
                         "{2}".format(nametype, name,
                                      ', '.join(illegal_chars)))
    return True
def parentParser():
    """Create the base parser holding options shared by the scripts.

    The parser is built with ``add_help=False`` so that it can be
    handed to other parsers through their ``parents`` argument.

    Returns:
        `argparse.ArgumentParser` with the following arguments
        already added:

            - ``--outdir``

            - ``--ncpus``

            - ``--use_existing``

            - ``-v / --version``
    """
    # ``%(prog)s`` is filled in by argparse when the version is printed.
    version_text = '%(prog)s {0}'.format(dms_tools2.__version__)

    common = argparse.ArgumentParser(add_help=False)
    common.add_argument(
        '--outdir',
        help='Output files to this directory (create if needed).')
    common.add_argument(
        '--ncpus',
        default=-1,
        type=int,
        help="Number of CPUs to use, -1 is all available.")
    common.add_argument(
        '--use_existing',
        default='no',
        choices=['yes', 'no'],
        help='If files with names of expected output already exist, '
             'do not re-run.')
    common.add_argument(
        '-v', '--version',
        action='version',
        version=version_text)
    return common
def parserDescription(description):
    """Append package information to a program description.

    Args:
        `description` (str)
            Description of program

    Returns:
        A string consisting of `description` followed by a sentence
        giving the `dms_tools2` package name (as a link to its URL),
        version, and author.
    """
    pkg = dms_tools2
    # Positional order matches the numbered fields in the template:
    # 0 = description, 1 = name, 2 = version, 3 = author, 4 = url.
    fields = (
        description,
        pkg.__name__,
        pkg.__version__,
        pkg.__author__,
        pkg.__url__,
    )
    return ("{0} Part of `{1} <{4}>`_ (version {2}) written by {3}."
            .format(*fields))
def bcsubampParentParser():
    """Parent parser for ``dms2_bcsubamp`` / ``dms2_batch_bcsubamp``.

    Extends the parser from `parentParser` with the alignment,
    barcode, read-trimming, quality-filtering and subsampling options
    that both the single-sample and batch programs accept.

    Returns:
        `argparse.ArgumentParser` intended for use in the ``parents``
        list of `bcsubampParser` and `batch_bcsubampParser`.
    """
    parser = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--refseq', required=True,
            help='Align subamplicons to gene in this FASTA file.')

    parser.add_argument('--alignspecs', required=True, nargs='+',
            help=("Subamplicon alignment positions as "
                  "'REFSEQSTART,REFSEQEND,R1START,R2START'. "
                  "REFSEQSTART is nt (1, 2, ... numbering) in "
                  "'refseq' where nt R1START in R1 aligns. "
                  "REFSEQEND is nt in 'refseq' where nt R2START "
                  "in R2 aligns."))

    parser.add_argument('--bclen', type=int, default=8,
            help='Length of NNN... barcode at start of each read. '
                 'Assumed to be same for R1 and R2, use `--bclen2` '
                 'if this is not the case.')

    parser.add_argument('--fastqdir',
            help='R1 and R2 files in this directory.')

    parser.add_argument('--R2', nargs='+', help=("Read 2 (R2) FASTQ "
            "files assumed to have same names as R1 but with "
            "'_R1' replaced by '_R2'. If that is not case, provide "
            "names here."))

    parser.add_argument('--R1trim', type=int, nargs='+',
            help=("Trim R1 from 3' end to this length. One value for all "
                  "reads or values for each subamplicon in ``--alignspecs``."))

    parser.add_argument('--R2trim', type=int, nargs='+',
            help="Like '--R1trim', but for R2.")

    parser.add_argument('--bclen2', type=int, help='If R1 and R2 have '
            'different length barcodes, use `--bclen` for R1 length '
            'and `--bclen2` for R2 length.')

    parser.add_argument('--chartype', default='codon', choices=['codon'],
            help='Character type for which we count mutations.')

    parser.add_argument('--maxmuts', type=int, default=4,
            help=("Max allowed mismatches in alignment of subamplicon; "
                  "mismatches counted in terms of character '--chartype'."))

    parser.add_argument('--minq', type=int, default=15,
            help="Only call nucleotides with Q score >= this.")

    parser.add_argument('--minreads', type=int, default=2,
            help=("Require this many reads in a barcode to agree to "
                  "call consensus nucleotide identity."))

    parser.add_argument('--minfraccall', type=float, default=0.95,
            help=("Retain only barcodes where trimmed consensus "
                  "sequence for each read has >= this frac sites called."))

    parser.add_argument('--minconcur', default=0.75, type=float,
            help=('Only call consensus identity for barcode when >= '
                  'this fraction of reads concur.'))

    parser.add_argument('--sitemask', help='Use to only consider '
            'mutations at a subset of sites. Should be a CSV file '
            'with column named `site` listing all sites to include.')

    parser.add_argument('--purgeread', type=float, default=0,
            help=("Randomly purge read pairs with this probability "
                  "to subsample data."))

    parser.add_argument('--purgebc', type=float, default=0,
            help=("Randomly purge barcodes with this probability to "
                  "subsample data."))

    # Both flags are off unless explicitly given on the command line.
    parser.set_defaults(bcinfo=False, bcinfo_csv=False)
    parser.add_argument('--bcinfo', dest='bcinfo', action='store_true',
            help=("Create file with suffix 'bcinfo.txt.gz' with info "
                  "about each barcode."))
    parser.add_argument('--bcinfo_csv', dest='bcinfo_csv',
            action='store_true', help=("Store 'bcinfo' file as a csv "
            "with the suffix 'bcinfo.csv.gz'. Only has an effect if "
            "`--bcinfo` is used."))

    return parser
def bcsubampParser():
    """Build the command-line parser for ``dms2_bcsubamp``.

    Starts from `bcsubampParentParser` and adds the required
    per-sample options ``--name`` and ``--R1``.

    Returns:
        `argparse.ArgumentParser` for ``dms2_bcsubamp``.
    """
    about = parserDescription(
        'Align barcoded subamplicons and count mutations.')
    base = bcsubampParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        '--name',
        required=True,
        help='Sample name used for output files.')
    cli.add_argument(
        '--R1',
        nargs='+',
        required=True,
        help="Read 1 (R1) FASTQ files, can be gzipped. See also "
             "'--fastqdir'.")
    return cli
def batch_bcsubampParser():
    """Build the command-line parser for ``dms2_batch_bcsubamp``.

    Starts from `bcsubampParentParser` and adds the required
    ``--batchfile`` and ``--summaryprefix`` options.

    Returns:
        `argparse.ArgumentParser` for ``dms2_batch_bcsubamp``.
    """
    about = parserDescription(
        'Perform many runs of ``dms2_bcsubamp`` and plot results.')
    base = bcsubampParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    batchfile_help = (
        "CSV file specifying each ``dms2_bcsubamp`` run. "
        "Must have these columns: `name`, `R1`. "
        "Can optionally have columns `R1trim` and `R2trim` with spaces "
        "delimiting subamplicon-specific trimming. "
        "If `R1trim` / `R2trim` in batch file, do **not** also give "
        "values for ``--R1trim`` and ``--R2trim``. "
        "Other columns are ignored, so other ``dms2_bcsubamp`` args "
        "should be passed as separate command line args rather than "
        "in ``--batchfile``."
    )
    cli.add_argument('--batchfile', required=True, help=batchfile_help)
    cli.add_argument(
        '--summaryprefix',
        required=True,
        help="Prefix of output summary plots.")
    return cli
def prefsParentParser():
    """Parent parser for ``dms2_prefs`` / ``dms2_batch_prefs``.

    Extends the parser from `parentParser` with the options that
    control how preferences are estimated.

    Returns:
        `argparse.ArgumentParser` intended for use in the ``parents``
        list of `prefsParser` and `batch_prefsParser`.
    """
    parser = argparse.ArgumentParser(
            parents=[parentParser()],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--method', default='bayesian',
            choices=['ratio', 'bayesian'], help="Method to "
            "estimate preferences: normalized enrichment ratios "
            "or Bayesian inference.")

    parser.add_argument('--indir', help="Input counts files in this "
            "directory.")

    parser.add_argument('--chartype', default='codon_to_aa',
            choices=['codon_to_aa'], help="Characters for which "
            "preferences are estimated. `codon_to_aa` = amino acids "
            "from codon counts.")

    parser.add_argument('--excludestop', default='yes', choices=['yes', 'no'],
            help="Exclude stop codons as a possible amino acid?")

    parser.add_argument('--conc', nargs=3, default=[1, 1, 1],
            type=float, metavar=('Cprefs', 'Cmut', 'Cerr'),
            help="Concentration parameters for priors for "
            "``--method bayesian``. Priors are over preferences, "
            "mutagenesis rate, and error rate(s).")

    # ``type=float`` makes a user-supplied value numeric; without it
    # argparse would hand back the raw command-line string while the
    # default stays a number.
    parser.add_argument('--pseudocount', default=1, type=float,
            help="Pseudocount used with ``--method ratio``.")

    return parser
def prefsParser():
    """Build the command-line parser for ``dms2_prefs``.

    Starts from `prefsParentParser` and adds the per-run options
    ``--pre``, ``--post``, ``--name`` (all required) and ``--err``.

    Returns:
        `argparse.ArgumentParser` for ``dms2_prefs``.
    """
    about = parserDescription('Estimate preferences from mutation counts.')
    base = prefsParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        '--pre',
        required=True,
        help='Pre-selection counts file or prefix used when '
             'creating this file.')
    cli.add_argument(
        '--post',
        required=True,
        help='Like ``--pre`` but for post-selection counts.')
    cli.add_argument(
        '--name',
        required=True,
        help='Name used for output files.')
    cli.add_argument(
        '--err',
        nargs=2,
        metavar=('ERRPRE', 'ERRPOST'),
        help="Like ``--pre`` but for counts for error control(s) "
             "for ``--pre`` and ``--post``. "
             "Specify same file twice for same control for both.")
    return cli
def batch_prefsParser():
    """Build the command-line parser for ``dms2_batch_prefs``.

    Starts from `prefsParentParser` and adds the required
    ``--batchfile`` and ``--summaryprefix`` options plus the
    ``--no_corr`` and ``--no_avg`` switches.

    Returns:
        `argparse.ArgumentParser` for ``dms2_batch_prefs``.
    """
    about = parserDescription(
        'Perform many runs of ``dms2_prefs`` and summarize results.')
    base = prefsParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    batchfile_help = (
        "CSV file specifying each ``dms2_prefs`` run. "
        "Must have these columns: `name`, `pre`, `post`. "
        "Can also have these columns: `err` or `errpre` and `errpost`. "
        "Other columns are ignored, so other ``dms2_prefs`` args should "
        "be passed as separate command line args rather than in "
        "``--batchfile``."
    )
    cli.add_argument('--batchfile', required=True, help=batchfile_help)
    cli.add_argument(
        '--summaryprefix',
        required=True,
        help="Prefix of output summary files and plots.")

    # Opt-out switches: each defaults to False and becomes True when
    # the flag is present on the command line.
    switches = (
        ('no_corr', 'Do not create correlation plot.'),
        ('no_avg', 'Do not create average prefs CSV.'),
    )
    for flag, flag_help in switches:
        cli.set_defaults(**{flag: False})
        cli.add_argument('--' + flag, dest=flag, action='store_true',
                         help=flag_help)
    return cli
def diffselParentParser():
    """Parent parser for ``dms2_diffsel`` / ``dms2_batch_diffsel``.

    Extends the parser from `parentParser` with the options that
    control how differential selection is estimated.

    Returns:
        `argparse.ArgumentParser` intended for use in the ``parents``
        list of `diffselParser` and `batch_diffselParser`.
    """
    shared = argparse.ArgumentParser(
        parents=[parentParser()],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    shared.add_argument(
        '--indir',
        help="Input counts files in this directory.")
    shared.add_argument(
        '--chartype',
        choices=['codon_to_aa'],
        default='codon_to_aa',
        help="Characters for which differential selection is "
             "estimated. `codon_to_aa` = amino acids from codon counts.")
    shared.add_argument(
        '--excludestop',
        choices=['yes', 'no'],
        default='yes',
        help="Exclude stop codons as a possible amino acid?")
    shared.add_argument(
        '--pseudocount',
        type=float,
        default=5,
        help="Pseudocount added to each count for sample with "
             "smaller depth; pseudocount for other sample scaled "
             "by relative depth.")
    shared.add_argument(
        '--mincount',
        type=float,
        default=0,
        help="Report as `NaN` the diffsel of mutations for which "
             "both selected and mock-selected samples have < this "
             "many counts.")
    return shared
def diffselParser():
    """Build the command-line parser for ``dms2_diffsel``.

    Starts from `diffselParentParser` and adds the per-run options
    ``--name``, ``--sel``, ``--mock`` (all required) and ``--err``.

    Returns:
        `argparse.ArgumentParser` for ``dms2_diffsel``.
    """
    about = parserDescription(
        'Estimate differential selection from mutation counts.')
    base = diffselParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        '--name',
        required=True,
        help='Name used for output files.')
    cli.add_argument(
        '--sel',
        required=True,
        help="Post-selection counts file or prefix used when "
             "creating this file.")
    cli.add_argument(
        '--mock',
        required=True,
        help="Like ``--sel``, but for mock-selection counts.")
    cli.add_argument(
        '--err',
        help="Like ``--sel`` but for error-control to correct "
             "mutation counts.")
    return cli
def batch_diffselParser():
    """Build the command-line parser for ``dms2_batch_diffsel``.

    Starts from `diffselParentParser` and adds the required
    ``--batchfile`` and ``--summaryprefix`` options.

    Returns:
        `argparse.ArgumentParser` for ``dms2_batch_diffsel``.
    """
    about = parserDescription(
        'Perform many runs of ``dms2_diffsel`` and summarize results.')
    base = diffselParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    batchfile_help = (
        'CSV file specifying each ``dms2_diffsel`` run. '
        'Must have these columns: `name`, `sel`, `mock`. '
        'Can also have these: `err`, `group`, `grouplabel`. '
        'If `group` is used, samples are grouped in summary plots '
        'labeled by `group`, or by `grouplabel` if specified. '
        'Other columns are ignored, so other ``dms2_diffsel`` args '
        'should be passed as separate command line args rather than '
        'in ``--batchfile``.'
    )
    cli.add_argument('--batchfile', required=True, help=batchfile_help)
    cli.add_argument(
        '--summaryprefix',
        required=True,
        help='Prefix of output summary files and plots.')
    return cli
def fracsurviveParser():
    """Build the command-line parser for ``dms2_fracsurvive``.

    Starts from `fracsurviveParentParser` and adds the per-run
    options ``--name``, ``--sel``, ``--mock``, ``--libfracsurvive``
    (all required) and ``--err``.

    Returns:
        `argparse.ArgumentParser` for ``dms2_fracsurvive``.
    """
    about = parserDescription(
        'Estimate fraction surviving for each mutation.')
    base = fracsurviveParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        '--name',
        required=True,
        help='Name used for output files.')
    cli.add_argument(
        '--sel',
        required=True,
        help="Post-selection counts file or prefix used when "
             "creating this file.")
    cli.add_argument(
        '--mock',
        required=True,
        help="Like ``--sel``, but for mock-selection counts.")
    cli.add_argument(
        '--libfracsurvive',
        required=True,
        type=float,
        help='Overall fraction of total library surviving selection '
             'versus mock condition. Should be between 0 and 1.')
    cli.add_argument(
        '--err',
        help="Like ``--sel`` but for error-control to correct "
             "mutation counts.")
    return cli
def batch_fracsurviveParser():
    """Build the command-line parser for ``dms2_batch_fracsurvive``.

    Starts from `fracsurviveParentParser` and adds the required
    ``--batchfile`` and ``--summaryprefix`` options.

    Returns:
        `argparse.ArgumentParser` for ``dms2_batch_fracsurvive``.
    """
    about = parserDescription(
        'Perform runs of ``dms2_fracsurvive`` and summarize results.')
    base = fracsurviveParentParser()

    cli = argparse.ArgumentParser(
        description=about,
        parents=[base],
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    batchfile_help = (
        'CSV file specifying each ``dms2_fracsurvive`` run. '
        'Must have these columns: `name`, `sel`, `mock`, `libfracsurvive`. '
        'Can also have these `err`, `group`, `grouplabel`. '
        'If `group` is used, samples are grouped in summary plots '
        'labeled by `group`, or by `grouplabel` if it is specified. '
        'Other columns are ignored, so other ``dms2_fracsurvive`` args '
        'should be passed as separate command line args rather than '
        'in ``--batchfile``.'
    )
    cli.add_argument('--batchfile', required=True, help=batchfile_help)
    cli.add_argument(
        '--summaryprefix',
        required=True,
        help='Prefix of output summary files and plots.')
    return cli
def fracsurviveParentParser():
    """Parent parser for ``dms2_fracsurvive`` / ``dms2_batch_fracsurvive``.

    Extends the parser from `parentParser` with the options that
    control how the fraction surviving is estimated.

    Returns:
        `argparse.ArgumentParser` intended for use in the ``parents``
        list of `fracsurviveParser` and `batch_fracsurviveParser`.
    """
    shared = argparse.ArgumentParser(
        parents=[parentParser()],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    shared.add_argument(
        '--indir',
        help="Input counts files in this directory.")
    shared.add_argument(
        '--chartype',
        choices=['codon_to_aa'],
        default='codon_to_aa',
        help="Characters for which fraction surviving selection is "
             "estimated. `codon_to_aa` = amino acids from codon counts.")
    shared.add_argument(
        '--aboveavg',
        choices=['yes', 'no'],
        default='no',
        help="Report fracsurvive **above** the library average "
             "rather than direct fracsurvive values.")
    shared.add_argument(
        '--excludestop',
        choices=['yes', 'no'],
        default='yes',
        help="Exclude stop codons as a possible amino acid?")
    shared.add_argument(
        '--pseudocount',
        type=float,
        default=5,
        help="Pseudocount added to each count for sample with "
             "smaller depth; pseudocount for other sample scaled "
             "by relative depth.")
    shared.add_argument(
        '--mincount',
        type=float,
        default=0,
        help="Report as `NaN` the fracsurvive of mutations for which "
             "both selected and mock-selected samples have < this "
             "many counts.")
    return shared
def logoplotParser():
    """Build the command-line parser for ``dms2_logoplot``.

    Starts from `parentParser`. Exactly one of the data-file options
    (``--prefs``, ``--diffsel``, ``--fracsurvive``, ``--diffprefs``,
    ``--muteffects``) must be given; the remaining options control
    the layout, coloring, overlays and scaling of the plot.

    Returns:
        `argparse.ArgumentParser` for ``dms2_logoplot``.
    """
    about = parserDescription('Create logo plot visualization.')
    cli = argparse.ArgumentParser(
        description=about,
        parents=[parentParser()],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # The input data types are mutually exclusive, and one is required.
    data_inputs = (
        ('--prefs', 'CSV file of amino-acid preferences.'),
        ('--diffsel', 'CSV file of amino-acid differential selection.'),
        ('--fracsurvive', 'CSV file of amino-acid fraction surviving.'),
        ('--diffprefs',
         'CSV file of differences in amino-acid preferences.'),
        ('--muteffects', 'CSV file of amino-acid mutational effects.'),
    )
    datatype = cli.add_mutually_exclusive_group(required=True)
    for flag, flag_help in data_inputs:
        datatype.add_argument(flag, help=flag_help)

    cli.add_argument(
        '--name',
        required=True,
        help='Name used for output files.')
    cli.add_argument(
        '--nperline',
        type=int,
        default=70,
        help='Number of sites per line.')
    cli.add_argument(
        '--numberevery',
        type=int,
        default=10,
        help='Number sites at this interval.')
    cli.add_argument(
        '--excludestop',
        default='no',
        choices=['yes', 'no'],
        help='Exclude stop codons as possible amino acid?')
    cli.add_argument(
        '--stringency',
        type=float,
        default=1,
        help='Stringency parameter to re-scale prefs.')
    cli.add_argument(
        '--restrictdiffsel',
        default='all',
        choices=['all', 'positive', 'negative'],
        help='Plot all diffsel, or only positive or negative.')
    cli.add_argument(
        '--diffselrange',
        nargs=2,
        type=float,
        metavar=('MINDIFFSEL', 'MAXDIFFSEL'),
        help='Specify a fixed range for `diffsel`. '
             'Otherwise determined from data range.')
    cli.add_argument(
        '--muteffectrange',
        nargs=2,
        type=float,
        metavar=('MINMUTEFFECT', 'MAXMUTEFFECT'),
        help='Specify a fixed range for `muteffects`. '
             'Otherwise determined from data range.')
    cli.add_argument(
        '--fracsurvivemax',
        type=float,
        help='Specify maximum value for `fracsurvive`. '
             'Otherwise determined from data range.')
    cli.add_argument(
        '--sortsites',
        default='yes',
        choices=['yes', 'no'],
        help='Sort sites from first to last before plotting.')
    cli.add_argument(
        '--mapmetric',
        default='functionalgroup',
        choices=['kd', 'mw', 'charge', 'functionalgroup', 'singlecolor'],
        help='Color amino acids by Kyte-Doolittle hydrophobicity, '
             'molecular weight, charge, or functional group.')

    # Shared opening of the two color-map help strings.
    cmap_link = ("`matplotlib color map "
                 "<http://matplotlib.org/users/colormaps.html>`_ ")
    cli.add_argument(
        '--colormap',
        default='jet',
        help=cmap_link
             + "for amino acids when `--mapmetric` is 'kd' or 'mw'; "
               "name of single color when it is 'singlecolor'.")

    cli.add_argument(
        '--overlay1',
        nargs=3,
        metavar=('FILE', 'SHORTNAME', 'LONGNAME'),
        help="Color bar above logo plot to denote per-residue property. "
             "FILE is CSV format with column names `site` and SHORTNAME. "
             "SHORTNAME is <= 5 character property name. "
             "LONGNAME is longer name for legend. "
             "Sites not in FILE are colored white. "
             "To show wildtype identity, make SHORTNAME and LONGNAME both "
             "`wildtype` and have this column in FILE give 1-letter "
             "wildtype amino-acid code. "
             "To show ``omegabysite.txt`` file from ``phydms``, give that "
             "file and set both SHORTNAME and LONGNAME to `omegabysite`.")
    extra_overlays = (
        ('--overlay2', 'Second overlay color bar.'),
        ('--overlay3', 'Third overlay color bar.'),
    )
    for flag, flag_help in extra_overlays:
        cli.add_argument(
            flag,
            default=None,
            nargs=3,
            metavar=('FILE', 'SHORTNAME', 'LONGNAME'),
            help=flag_help)

    cli.add_argument(
        '--underlay',
        default='no',
        choices=['yes', 'no'],
        help='Plot underlay rather than overlay bars.')
    cli.add_argument(
        '--scalebar',
        default=None,
        nargs=2,
        metavar=('BARHEIGHT', 'LABEL'),
        help='Plot a scale bar indicating BARHEIGHT with LABEL. '
             'Only for `diffsel`, `fracsurvive`, and `muteffects`.')
    cli.add_argument(
        '--overlaycolormap',
        default='jet',
        help=cmap_link + "for overlay bars (e.g., 'jet' or 'YlOrRd').")
    cli.add_argument(
        '--letterheight',
        type=float,
        default=1,
        help="Relative height of letter stacks in logo plot.")
    cli.add_argument(
        '--ignore_extracols',
        default='no',
        choices=['yes', 'no'],
        help='Ignore extra columns in data')
    cli.add_argument(
        '--sepline',
        default='yes',
        choices=['yes', 'no'],
        help="Separate positive and negative diffsel with black line?")
    return cli
if __name__ == '__main__':
    # When executed directly, run the doctests embedded in this
    # module's docstrings.
    from doctest import testmod
    testmod()