Source code for alignparse.minimap2

"""
===============
minimap2
===============

Runs `minimap2 <https://lh3.github.io/minimap2/>`_ aligner.
``minimap2`` must be version 2.17 or higher.

"""

import contextlib
import os
import re
import subprocess
import tempfile
import textwrap  # noqa: F401

import packaging.version

import pysam


OPTIONS_CODON_DMS = (
    "-A2",
    "-B4",
    "-O12",
    "-E2",
    "--end-bonus=13",
    "--secondary=no",
    "--cs",
)
"""tuple: ``minimap2`` options for codon-mutant libraries.

Note
----
These options work well for sequences expected to have codon-level mutations
(3 nucleotides in a row) and not expected to have many indels.

Options have the following meaning / rationale:
  - ``-A2`` : score for a match
  - ``-B4`` : penalty for a mismatch
  - ``-O12`` : gap opening penalty (high, because few indels expected)
  - ``-E2`` : gap extension penalty
  - ``--end-bonus=13`` : favor codon mutation over gap at termini.
  - ``--secondary=no`` : do not output secondary alignments.
  - ``--cs`` : add short ``cs`` tag ((see https://github.com/lh3/minimap2#cs)

"""

OPTIONS_VIRUS_W_DEL = (
    "-xsplice:hq",
    "-un",
    "-C0",
    "--splice-flank=no",
    "-M=1",
    "--end-seed-pen=2",
    "--end-bonus=2",
    "--secondary=no",
    "--cs",
)
"""tuple: ``minimap2`` options for viral genes.

Note
----
These options work well for sequences expected to have nucleotide mutations
and long internal deletions. It handles long deletions by having ``minimap2``
treat them like introns. This means that in the output, introns should be
parsed like deletions.

Options have the following meaning / rationale:
  - ``-xsplice:hq`` : preset for long-read high-quality spliced alignment.
  - ``-un`` : do not try to find canonical splice sites.
  - ``-C0`` : no cost for non-canonical splice sites.
  - ``--splice-flank=no`` : no assumptions about base next to splice donor.
  - ``-M=1`` : mark secondary chain that overlaps with another by this much.
  - ``--end-seed-pen=2`` : described as helping avoid tiny terminal exons.
  - ``--end-bonus=2`` : bonus for extending to end of alignment.
  - ``--secondary=no`` : do not output secondary alignments.
  - ``--cs`` : add short ``cs`` tag ((see https://github.com/lh3/minimap2#cs)

"""


[docs]class Mapper: r"""Run `minimap2 <https://lh3.github.io/minimap2/>`_. Parameters ---------- options : tuple or list Command line options to ``minimap2``. For instance, see :data:`OPTIONS_CODON_DMS` or :data:`OPTIONS_VIRUS_W_DEL`. prog : str Path to ``minimap2`` executable. min_version : str Minimum version of ``minimap2``. :class:`Mapper` has only been tested with versions >= 2.17. check_cs : bool Check that `options` includes ``--cs`` to output the short ``cs`` tag. retain_tags : None or list Retain these tags in query sequences in output alignments. Attributes ---------- options : list Options to ``minimap2``. prog : str Path to ``minimap2`` executable. version : str Version of ``minimap2``. retain_tags : None or list List of tags to retain. Note ----- If `retain_tags` is set, then :meth:`Mapper.map_to_sam` searches for tags in query FASTQ headers like the `np`` tags here:: @m54228_181120_212724/4194377/ccs np:i:45 and retains them in the `samfile` alignment output. Such tags are present in FASTQ files created by PacBio ``ccs`` version 4.0. For the above header, you'd use `retain_tags=['np']`. Examples -------- First, create toy example target and query file: >>> tempdir = tempfile.TemporaryDirectory() >>> targetfile = os.path.join(tempdir.name, 'targets.fasta') >>> queryfile = os.path.join(tempdir.name, 'queries.fastq') >>> with open(targetfile, 'w') as f: ... _ = f.write(textwrap.dedent(''' ... >refseq ... ATGCAANNNGATGCATAGTATTAGCATAAATAGGATAGCCATAAGGTTACTGCATAAGGGTAT ... ''').strip()) >>> with open(queryfile, 'w') as f: ... _ = f.write(textwrap.dedent(''' ... @m54228_181120_212724/4194376/ccs np:i:127 ... ATGCAAAATGATGCATAGTATTAGCATAAATAGGATAGCCATAAGGTTACTGCATAAGAGTAT ... + ... ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ... ''').strip()) Now map **not** retaining tags from the FASTQ (this is default behavior): >>> samfile = os.path.join(tempdir.name, 'alignments.sam') >>> mapper = Mapper(OPTIONS_CODON_DMS) >>> mapper.map_to_sam(targetfile, queryfile, samfile) >>> for a in pysam.AlignmentFile(samfile): ... tag_names = [tup[0] for tup in a.get_tags()] ... print(a.tostring()) # doctest: +NORMALIZE_WHITESPACE m54228_181120_212724/4194376/ccs 0 refseq 1 1 63M * 0 0 ATGCAAAATGATGCATAGTATTAGCATAAATAGGATAGCCATAAGGTTACTGCATAAGAGTAT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NM:i:4 ms:i:111 AS:i:111 nn:i:3 tp:A:P cm:i:7 s1:i:41 s2:i:0 de:f:0.0167 cs:Z::6*na*na*nt:49*ga:4 rl:i:0 >>> print(tag_names) ['NM', 'ms', 'AS', 'nn', 'tp', 'cm', 's1', 's2', 'de', 'cs', 'rl'] Now map retaining the `np` tag in the FASTQ: >>> samfile_tags = os.path.join(tempdir.name, 'alignments_tags.sam') >>> mapper_tags = Mapper(OPTIONS_CODON_DMS, retain_tags=['np']) >>> mapper_tags.map_to_sam(targetfile, queryfile, samfile_tags) >>> for a in pysam.AlignmentFile(samfile_tags): ... tag_names = [tup[0] for tup in a.get_tags()] ... print(a.tostring()) # doctest: +NORMALIZE_WHITESPACE m54228_181120_212724/4194376/ccs 0 refseq 1 1 63M * 0 0 ATGCAAAATGATGCATAGTATTAGCATAAATAGGATAGCCATAAGGTTACTGCATAAGAGTAT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NM:i:4 ms:i:111 AS:i:111 nn:i:3 tp:A:P cm:i:7 s1:i:41 s2:i:0 de:f:0.0167 cs:Z::6*na*na*nt:49*ga:4 rl:i:0 np:i:127 >>> print(tag_names) # doctest: +NORMALIZE_WHITESPACE ['NM', 'ms', 'AS', 'nn', 'tp', 'cm', 's1', 's2', 'de', 'cs', 'rl', 'np'] Remove the temporary directory for the example: >>> tempdir.cleanup() """ def __init__( self, options, *, prog="minimap2", min_version="2.17", check_cs=True, retain_tags=None, ): """See main :class:`Mapper` doc string.""" try: version = subprocess.check_output([prog, "--version"]) except Exception: raise ValueError(f"Can't execute `prog` {prog}. Is it installed?") self.version = version.strip().decode("utf-8") min_version = packaging.version.parse(min_version) if packaging.version.parse(self.version) < min_version: raise ValueError( f"You have `minimap2` version {self.version}, " f"but need >= {min_version}" ) self.prog = prog self.options = list(options) if check_cs and not any("--cs" == opt for opt in self.options): raise ValueError(f"`options` do not include `--cs`:\n{options}") if retain_tags: if isinstance(retain_tags, str): self.retain_tags = [retain_tags] else: self.retain_tags = list(retain_tags) else: self.retain_tags = None
[docs] def map_to_sam(self, targetfile, queryfile, samfile): """Map query sequences to targets and output SAM file. Parameters ---------- targetfile : str Alignment targets (reference), typically FASTA (can be gzipped). queryfile : str Queries to align, typically FASTA or FASTQ (can be gzipped). samfile : str Name of created SAM file (should have suffix ``.sam``). """ for fname, f in [("target", targetfile), ("query", queryfile)]: if not os.path.isfile(f): raise IOError(f"cannot find `{fname}file` {f}") if os.path.splitext(samfile)[1] != ".sam": raise ValueError(f"`samfile` lacks extension '.sam': {samfile}") if not any("-a" == opt for opt in self.options): options = self.options + ["-a"] cmds = [self.prog] + options + [targetfile, queryfile] with tempfile.TemporaryDirectory() as tempdir: # if retaining tags, we initially align to temporary file # then add tags to final alignment file if self.retain_tags: untagged_samfile = os.path.join(tempdir, os.path.basename(samfile)) else: untagged_samfile = samfile # do the alignment with contextlib.ExitStack() as stack: sam = stack.enter_context(open(untagged_samfile, "w")) err = stack.enter_context(tempfile.TemporaryFile(mode="w")) try: _ = subprocess.check_call(cmds, stdout=sam, stderr=err) except Exception as exc: if os.path.isfile(untagged_samfile): os.remove(untagged_samfile) exc_type = type(exc).__name__ err.seek(0) raise RuntimeError( f"Error running commands:\n{cmds}\n\n" f"Exception:\n{exc_type}: {exc}\n\n" f"Error message:\n{err.read()}\n\n" ) # if needed, add retained tags from queries FASTQ file if self.retain_tags: matchers = { tag: re.compile( r"(?:^|\s)" # start or space rf"{tag}:(?P<valtype>\w):(?P<val>\S+)" r"(?:\s|$)" # end or space ) for tag in self.retain_tags } with contextlib.ExitStack() as tagstack: untagged_sam = tagstack.enter_context( pysam.AlignmentFile(untagged_samfile) ) tagged_sam = tagstack.enter_context( pysam.AlignmentFile(samfile, mode="w", template=untagged_sam) ) queries = tagstack.enter_context(pysam.FastxFile(queryfile)) query = next(queries) for a in untagged_sam: name = a.query_name try: while query.name != name: query = next(queries) if self.retain_tags and not query.comment: raise ValueError( "specified `retain_tags` but " "no tags in `queryfile` " f"{queryfile}" ) for tag in self.retain_tags: m = matchers[tag].search(query.comment) if not m: raise ValueError(f"no tag {tag}:\n{query}") valtype = m.group("valtype") if valtype == "i": val = int(m.group("val")) elif valtype == "f": val = float(m.group("val")) elif valtype in {"A", "Z"}: val = m.group("val") else: raise ValueError( f"bad tag type {valtype} " f"for tag {tag}" ) a.set_tag(tag, val, valtype) except StopIteration: raise ValueError( f"No entry in {queryfile} for " f"query {name}\nMaybe alignment " f"is not sorted in query order?" ) tagged_sam.write(a) assert os.path.isfile(samfile)
if __name__ == "__main__": import doctest doctest.testmod()