Source code for dms_tools2.protstruct

"""
===========
protstruct
===========
Module for operations related to protein structures.
"""

import os
import itertools
import tempfile
import collections

import natsort
import numpy
import Bio.PDB


[docs]def atomDist(a1, a2):
    """Calculates distance between two atoms.

    Args:
        `a1`, `a2` (`Bio.PDB.Atom.Atom` objects)
            The two atoms.

    Returns:
        The distance between the two atoms.

    >>> Atom = Bio.PDB.Atom.Atom
    >>> a1 = Atom(name='CA', coord=(1.0, 2.0, 3.0), bfactor=1, occupancy=1,
    ...           altloc='', fullname='CA', serial_number=1, element='C')
    >>> a2 = Atom(name='CA', coord=(1.5, 3.0, 4.5), bfactor=1, occupancy=1,
    ...           altloc='', fullname='CA', serial_number=2, element='C')
    >>> round(atomDist(a1, a2), 3)
    1.871
    """
    diff_vector = numpy.array(a1.coord) - numpy.array(a2.coord)
    return numpy.sqrt(numpy.sum(diff_vector * diff_vector))


[docs]def residueDist(r1, r2, atom=None):
    """Calculates distance between closest atoms in two residues.

    Args:
        `r1`, `r2` (`Bio.PDB.Residue.Residue` objects)
            The two residues.
        `atom` (`None` or str)
            If `None`, computes closest distance between any atoms.
            If a string, only computes closest distance among atoms
            of this type (e.g., `CA` for alpha carbon distances).

    Returns:
        Distance between closest atoms in the two residues.

    >>> Residue = Bio.PDB.Residue.Residue
    >>> Atom = Bio.PDB.Atom.Atom
    >>> r1 = Residue('r1', 'r1', 'A')
    >>> a1 = Atom(name='CA', coord=(1.0, 2.0, 2.5), bfactor=1, occupancy=1,
    ...           altloc='', fullname='CA', serial_number=1, element='C')
    >>> a2 = Atom(name='N', coord=(1.5, 3.0, 4.5), bfactor=1, occupancy=1,
    ...           altloc='', fullname='N', serial_number=2, element='N')
    >>> r1.add(a1)
    >>> r1.add(a2)
    >>> r2 = Residue('r2', 'r2', 'A')
    >>> a3 = Atom(name='CA', coord=(2.0, 4.0, 6.0), bfactor=1, occupancy=1,
    ...           altloc='', fullname='CA', serial_number=1, element='C')
    >>> r2.add(a3)
    >>> round(residueDist(r1, r2), 3) == 1.871
    True
    >>> round(residueDist(r1, r2, atom='CA'), 3) == 4.153
    True
    """
    if atom is None:
        d = min([atomDist(a1, a2) for (a1, a2) in itertools.product(r1, r2)])
    else:
        d = atomDist(r1[atom], r2[atom])
    return d


[docs]def distMatrix(pdbfile, chains, dist_type, equivchains={}, ignore_hetero=True):
    """Residue-residue distance matrix for protein or homo-oligomer.

    Args:
        `pdbfile` (str)
            Name of PDB file.
        `chains` (str or list of str)
            Chain in `pdbfile` for which we compute residue distances,
            or list of chains if the residues span several chains 
            (as for proteins like HA which are cleaved to subunits).
        `dist_type` (str)
            Distances to measure. Use `CA` for alpha carbon distances,
            and `any` for nearest distances of any atom in residues.
        `equivchains` (dict)
            If the structure is a homo-oligomer, each chain may
            have other equivalent chains. In this case, for each
            `chain`, `equivchains[chain]` should be list of equivalent
            chains that we also include in the distance calculations.
        `ignorehetero` (bool)
            Ignore hetero-residues, and only consider protein ones.    

    Returns:
        The 2-tuple `(residues, distmatrix)` where `residues` is a
        list of the residue names (as strings) and `distmatrix` is a 
        `numpy.ndarray` with element `distmatrix[i, j]` giving
        the distance between residue `residues[i]` and `residues[j]`.

    Here is an example computation of distances between two residues
    spanning two chains:

    >>> with tempfile.NamedTemporaryFile(mode='w') as f:
    ...     n = f.write(
    ...      '\\n'.join([
    ...      'ATOM   4633  N   VAL X 505A     57.621  44.297  43.089  1.00 96.43           N',
    ...      'ATOM   4634  CA  VAL X 505A     58.278  45.594  43.147  1.00 84.49           C',
    ...      'ATOM   4635  C   VAL X 505A     59.779  45.434  42.943  1.00 82.23           C',
    ...      'ATOM   4636  O   VAL X 505A     60.339  44.372  43.218  1.00 91.07           O',
    ...      'ATOM   4637  CB  VAL X 505A     58.010  46.304  44.493  1.00 98.34           C',
    ...      'ATOM   4638  CG1 VAL X 505A     58.732  47.644  44.544  1.00 94.96           C',
    ...      'ATOM   4639  CG2 VAL X 505A     56.511  46.483  44.713  1.00 92.78           C',
    ...      'ATOM      1  N   VAL A 518      46.814  16.139  29.171  1.00 92.74           N',
    ...      'ATOM      2  CA  VAL A 518      46.514  15.640  27.833  1.00 99.45           C',
    ...      'ATOM      3  C   VAL A 518      47.047  16.605  26.764  1.00102.76           C',
    ...      'ATOM      4  O   VAL A 518      47.281  17.785  27.034  1.00 86.02           O',
    ...      'ATOM      5  CB  VAL A 518      44.993  15.424  27.640  1.00 95.35           C',
    ...      'ATOM      6  CG1 VAL A 518      44.729  14.428  26.515  1.00 70.29           C',
    ...      'ATOM      7  CG2 VAL A 518      44.357  14.931  28.935  1.00 97.65           C',
    ...      'ATOM      8  CA  VAL Y 505A     47.514  16.640  28.833  1.00 99.45           C',
    ...      ]))
    ...     f.flush()
    ...     (residues, ca_dist) = distMatrix(f.name, ['A', 'X'], 'CA')
    ...     (residues, any_dist) = distMatrix(f.name, ['A', 'X'], 'any')
    ...     (residues, equiv_dist) = distMatrix(f.name, ['A', 'X'], 'CA', {'X':['Y']})
    >>> residues
    ['505A', '518']
    >>> numpy.allclose(ca_dist, numpy.array(
    ...    [[0, 35.639], [35.639, 0]]), atol=1e-3)
    True
    >>> numpy.allclose(any_dist, numpy.array(
    ...    [[0, 32.674], [32.674, 0]]), atol=1e-3)
    True
    >>> numpy.allclose(equiv_dist, numpy.array(
    ...    [[0, 1.732], [1.732, 0]]), atol=1e-3)
    True
    """
    if isinstance(chains, str):
        chains = list(chains)

    # read PDB and take first model (typically there is only one unless NMR)
    structure = Bio.PDB.PDBParser().get_structure('id', pdbfile)
    model = structure[0]

    # lists residue objects 
    res_objs = []
    # keyed by residue objects, lists equivalents in other chains
    equiv_res_objs = collections.defaultdict(list) 
    for chain in chains:
        for res in model[chain]:
            res_id = res.get_id()
            if res_id[0].strip() and ignore_hetero:
                continue
            res_objs.append(res)
            if equivchains and chain in equivchains:
                for otherchain in equivchains[chain]:
                    equiv_res_objs[res] += [res2 for res2 in
                            model[otherchain] if res2.get_id() == res_id]

    # get residue names and sort 
    residues = ['{0}{1}'.format(n, i).strip() for (het, n, i) in
            map(lambda x: x.get_id(), res_objs)]
    assert len(residues) == len(set(residues)), "duplicate residue names"
    sortindex = natsort.index_natsorted(residues)
    residues = natsort.order_by_index(residues, sortindex)
    res_objs = natsort.order_by_index(res_objs, sortindex)

    # build up distance matrix
    if dist_type == 'any':
        atom = None
    elif dist_type == 'CA':
        atom = 'CA'
    else:
        raise ValueError("invalid dist_type: {0}".format(dist_type))
    distmatrix = numpy.zeros((len(residues), len(residues)))
    for ((i, ri), (j, rj)) in itertools.combinations(enumerate(res_objs), 2):
        dist = min(
                [residueDist(ri, rj, atom=atom)] +
                [residueDist(ri, rk, atom=atom) for rk in equiv_res_objs[rj]] +
                [residueDist(rk, rj, atom=atom) for rk in equiv_res_objs[ri]]
                )
        distmatrix[i, j] = distmatrix[j, i] = dist

    return (residues, distmatrix)



if __name__ == '__main__':
    import doctest
    doctest.testmod()
Source code for dms_tools2.protstruct

dms_tools2

Navigation

Related Topics