# Source code for dms_tools2.dssp
"""
===========
dssp
===========
Process output from `dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_.
`dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_ can be used to calculate
secondary structure and solvent accessibility information from a
PDB structure. This module can process that output.
"""
import os
import re
import pandas
import Bio.PDB
#: Maximum accessible surface area (in square angstroms) of each amino
#: acid, taken from
#: `Tien et al (2013) <https://doi.org/10.1371/journal.pone.0080635>`_.
#: Keys are one-letter amino-acid codes, so `MAX_ASA_TIEN[a]` is the
#: max surface area for amino acid `a`.
MAX_ASA_TIEN = {
    'A': 129.0,  # alanine
    'R': 274.0,  # arginine
    'N': 195.0,  # asparagine
    'D': 193.0,  # aspartate
    'C': 167.0,  # cysteine
    'E': 223.0,  # glutamate
    'Q': 225.0,  # glutamine
    'G': 104.0,  # glycine
    'H': 224.0,  # histidine
    'I': 197.0,  # isoleucine
    'L': 201.0,  # leucine
    'K': 236.0,  # lysine
    'M': 224.0,  # methionine
    'F': 240.0,  # phenylalanine
    'P': 159.0,  # proline
    'S': 155.0,  # serine
    'T': 172.0,  # threonine
    'W': 285.0,  # tryptophan
    'Y': 263.0,  # tyrosine
    'V': 174.0,  # valine
}
def processDSSP(dsspfile, chain=None, max_asa=MAX_ASA_TIEN):
    """Get secondary structure and solvent accessibility from ``dssp``.

    `dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_ is a program
    that calculates secondary structure and absolute solvent
    accessibility from a PDB file.

    This function processes the text output provided by the
    `dssp webserver <http://swift.cmbi.ru.nl/gv/dssp/>`_, at
    least given the format of that output as of Sept-4-2017.
    It returns a `pandas.DataFrame` that gives the secondary
    structure and solvent accessibility for each residue in the
    ``dssp`` output.

    Args:
        `dsspfile` (str)
            Name of text file containing ``dssp`` output.
        `chain` (str or `None`)
            If the PDB file analyzed by ``dssp`` to create `dsspfile`
            has more than one chain, specify the letter code for one
            of those chains with this argument. If `None`, `dsspfile`
            must describe exactly one chain, and that chain is used.
        `max_asa` (dict)
            Max surface area for each amino acid in square angstroms,
            keyed by one-letter amino-acid code.

    Returns:
        A `pandas.DataFrame` with one row per residue of the selected
        chain and the following columns:

            - `site`: residue number for all sites in `dsspfile`.
              Sites with a letter suffix are given as a `str` of the
              number followed by the suffix; other sites are given
              as the residue number itself.

            - `amino_acid`: amino acid identity of site in `dsspfile`.
              Lowercase letters in the ``dssp`` output (used by
              ``dssp`` for cysteines in disulfide bridges) are
              reported as `C`.

            - `ASA`: absolute solvent accessibility of the residue.

            - `RSA`: relative solvent accessibility of the residue,
              computed as `ASA` divided by `max_asa` for that
              amino acid.

            - `SS`: ``dssp`` secondary structure code, one of:

                - *G*: 3-10 helix

                - *H*: alpha helix

                - *I*: pi helix

                - *B*: isolated beta bridge

                - *E*: extended strand in a beta ladder

                - *T*: turn

                - *S*: high curvature

                - *-*: loop

            - `SS_class`: broader secondary structure class:

                - *helix*: `SS` value of *G*, *H*, or *I*

                - *strand*: `SS` value of *B* or *E*

                - *loop*: any of the other `SS` values.

    Raises:
        `ValueError`
            If `chain` is `None` and `dsspfile` does not contain
            exactly one chain, if `chain` is not a chain in
            `dsspfile`, or if a residue has a secondary structure
            code other than those listed above.
        `KeyError`
            If a residue's amino acid is not a key of `max_asa`.
    """
    # a lowercase letter in the amino-acid column is recorded as cysteine
    lowercase_aa = re.compile(r'[a-z]')

    # map each recognized dssp code to its broader class
    ss_classes = {'G': 'helix', 'H': 'helix', 'I': 'helix',
                  'B': 'strand', 'E': 'strand',
                  'T': 'loop', 'S': 'loop', '-': 'loop'}

    # keys of `d_dssp` are `(chainid, residue_id)` tuples
    d_dssp = Bio.PDB.make_dssp_dict(dsspfile)[0]

    chains = {chainid for (chainid, _) in d_dssp}
    if chain is None:
        # Raise rather than `assert`: an assertion is stripped under
        # `python -O`, which would silently pick an arbitrary chain.
        if len(chains) != 1:
            raise ValueError(
                    "chain is None, but dsspfile has {0} chains "
                    "(expected exactly 1): {1}".format(
                    len(chains), sorted(chains)))
        (chain,) = chains
    elif chain not in chains:
        raise ValueError("Invalid chain {0}".format(chain))

    d_df = {'site': [],
            'amino_acid': [],
            'ASA': [],
            'RSA': [],
            'SS': [],
            'SS_class': [],
            }

    for ((chainid, resid), tup) in d_dssp.items():
        if chainid != chain:
            continue

        (aa, ss, acc) = tup[: 3]
        if lowercase_aa.match(aa):
            aa = 'C'
        if ss not in ss_classes:
            raise ValueError("invalid SS of {0}".format(ss))

        (resnum, suffix) = (resid[1], resid[2])
        if suffix and not suffix.isspace():
            # site has letter suffix
            site = str(resnum) + suffix.strip()
        else:
            site = resnum

        d_df['site'].append(site)
        d_df['amino_acid'].append(aa)
        d_df['ASA'].append(acc)
        d_df['RSA'].append(acc / float(max_asa[aa]))
        d_df['SS'].append(ss)
        d_df['SS_class'].append(ss_classes[ss])

    return pandas.DataFrame(d_df)
if __name__ == '__main__':
    # When this module is executed as a script, run the doctests
    # found in its docstrings.
    import doctest
    doctest.testmod()