Source code for dms_tools2.protstruct

"""
===========
protstruct
===========
Module for operations related to protein structures.
"""

import os
import itertools
import tempfile
import collections

import natsort
import numpy
import Bio.PDB


[docs]def atomDist(a1, a2): """Calculates distance between two atoms. Args: `a1`, `a2` (`Bio.PDB.Atom.Atom` objects) The two atoms. Returns: The distance between the two atoms. >>> Atom = Bio.PDB.Atom.Atom >>> a1 = Atom(name='CA', coord=(1.0, 2.0, 3.0), bfactor=1, occupancy=1, ... altloc='', fullname='CA', serial_number=1, element='C') >>> a2 = Atom(name='CA', coord=(1.5, 3.0, 4.5), bfactor=1, occupancy=1, ... altloc='', fullname='CA', serial_number=2, element='C') >>> round(atomDist(a1, a2), 3) 1.871 """ diff_vector = numpy.array(a1.coord) - numpy.array(a2.coord) return numpy.sqrt(numpy.sum(diff_vector * diff_vector))
[docs]def residueDist(r1, r2, atom=None): """Calculates distance between closest atoms in two residues. Args: `r1`, `r2` (`Bio.PDB.Residue.Residue` objects) The two residues. `atom` (`None` or str) If `None`, computes closest distance between any atoms. If a string, only computes closest distance among atoms of this type (e.g., `CA` for alpha carbon distances). Returns: Distance between closest atoms in the two residues. >>> Residue = Bio.PDB.Residue.Residue >>> Atom = Bio.PDB.Atom.Atom >>> r1 = Residue('r1', 'r1', 'A') >>> a1 = Atom(name='CA', coord=(1.0, 2.0, 2.5), bfactor=1, occupancy=1, ... altloc='', fullname='CA', serial_number=1, element='C') >>> a2 = Atom(name='N', coord=(1.5, 3.0, 4.5), bfactor=1, occupancy=1, ... altloc='', fullname='N', serial_number=2, element='N') >>> r1.add(a1) >>> r1.add(a2) >>> r2 = Residue('r2', 'r2', 'A') >>> a3 = Atom(name='CA', coord=(2.0, 4.0, 6.0), bfactor=1, occupancy=1, ... altloc='', fullname='CA', serial_number=1, element='C') >>> r2.add(a3) >>> round(residueDist(r1, r2), 3) == 1.871 True >>> round(residueDist(r1, r2, atom='CA'), 3) == 4.153 True """ if atom is None: d = min([atomDist(a1, a2) for (a1, a2) in itertools.product(r1, r2)]) else: d = atomDist(r1[atom], r2[atom]) return d
[docs]def distMatrix(pdbfile, chains, dist_type, equivchains={}, ignore_hetero=True): """Residue-residue distance matrix for protein or homo-oligomer. Args: `pdbfile` (str) Name of PDB file. `chains` (str or list of str) Chain in `pdbfile` for which we compute residue distances, or list of chains if the residues span several chains (as for proteins like HA which are cleaved to subunits). `dist_type` (str) Distances to measure. Use `CA` for alpha carbon distances, and `any` for nearest distances of any atom in residues. `equivchains` (dict) If the structure is a homo-oligomer, each chain may have other equivalent chains. In this case, for each `chain`, `equivchains[chain]` should be list of equivalent chains that we also include in the distance calculations. `ignorehetero` (bool) Ignore hetero-residues, and only consider protein ones. Returns: The 2-tuple `(residues, distmatrix)` where `residues` is a list of the residue names (as strings) and `distmatrix` is a `numpy.ndarray` with element `distmatrix[i, j]` giving the distance between residue `residues[i]` and `residues[j]`. Here is an example computation of distances between two residues spanning two chains: >>> with tempfile.NamedTemporaryFile(mode='w') as f: ... n = f.write( ... '\\n'.join([ ... 'ATOM 4633 N VAL X 505A 57.621 44.297 43.089 1.00 96.43 N', ... 'ATOM 4634 CA VAL X 505A 58.278 45.594 43.147 1.00 84.49 C', ... 'ATOM 4635 C VAL X 505A 59.779 45.434 42.943 1.00 82.23 C', ... 'ATOM 4636 O VAL X 505A 60.339 44.372 43.218 1.00 91.07 O', ... 'ATOM 4637 CB VAL X 505A 58.010 46.304 44.493 1.00 98.34 C', ... 'ATOM 4638 CG1 VAL X 505A 58.732 47.644 44.544 1.00 94.96 C', ... 'ATOM 4639 CG2 VAL X 505A 56.511 46.483 44.713 1.00 92.78 C', ... 'ATOM 1 N VAL A 518 46.814 16.139 29.171 1.00 92.74 N', ... 'ATOM 2 CA VAL A 518 46.514 15.640 27.833 1.00 99.45 C', ... 'ATOM 3 C VAL A 518 47.047 16.605 26.764 1.00102.76 C', ... 'ATOM 4 O VAL A 518 47.281 17.785 27.034 1.00 86.02 O', ... 'ATOM 5 CB VAL A 518 44.993 15.424 27.640 1.00 95.35 C', ... 'ATOM 6 CG1 VAL A 518 44.729 14.428 26.515 1.00 70.29 C', ... 'ATOM 7 CG2 VAL A 518 44.357 14.931 28.935 1.00 97.65 C', ... 'ATOM 8 CA VAL Y 505A 47.514 16.640 28.833 1.00 99.45 C', ... ])) ... f.flush() ... (residues, ca_dist) = distMatrix(f.name, ['A', 'X'], 'CA') ... (residues, any_dist) = distMatrix(f.name, ['A', 'X'], 'any') ... (residues, equiv_dist) = distMatrix(f.name, ['A', 'X'], 'CA', {'X':['Y']}) >>> residues ['505A', '518'] >>> numpy.allclose(ca_dist, numpy.array( ... [[0, 35.639], [35.639, 0]]), atol=1e-3) True >>> numpy.allclose(any_dist, numpy.array( ... [[0, 32.674], [32.674, 0]]), atol=1e-3) True >>> numpy.allclose(equiv_dist, numpy.array( ... [[0, 1.732], [1.732, 0]]), atol=1e-3) True """ if isinstance(chains, str): chains = list(chains) # read PDB and take first model (typically there is only one unless NMR) structure = Bio.PDB.PDBParser().get_structure('id', pdbfile) model = structure[0] # lists residue objects res_objs = [] # keyed by residue objects, lists equivalents in other chains equiv_res_objs = collections.defaultdict(list) for chain in chains: for res in model[chain]: res_id = res.get_id() if res_id[0].strip() and ignore_hetero: continue res_objs.append(res) if equivchains and chain in equivchains: for otherchain in equivchains[chain]: equiv_res_objs[res] += [res2 for res2 in model[otherchain] if res2.get_id() == res_id] # get residue names and sort residues = ['{0}{1}'.format(n, i).strip() for (het, n, i) in map(lambda x: x.get_id(), res_objs)] assert len(residues) == len(set(residues)), "duplicate residue names" sortindex = natsort.index_natsorted(residues) residues = natsort.order_by_index(residues, sortindex) res_objs = natsort.order_by_index(res_objs, sortindex) # build up distance matrix if dist_type == 'any': atom = None elif dist_type == 'CA': atom = 'CA' else: raise ValueError("invalid dist_type: {0}".format(dist_type)) distmatrix = numpy.zeros((len(residues), len(residues))) for ((i, ri), (j, rj)) in itertools.combinations(enumerate(res_objs), 2): dist = min( [residueDist(ri, rj, atom=atom)] + [residueDist(ri, rk, atom=atom) for rk in equiv_res_objs[rj]] + [residueDist(rk, rj, atom=atom) for rk in equiv_res_objs[ri]] ) distmatrix[i, j] = distmatrix[j, i] = dist return (residues, distmatrix)
if __name__ == '__main__': import doctest doctest.testmod()