# Source code for dms_tools2.dssp

"""
===========
dssp
===========
Process output from `dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_.

`dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_ can be used to calculate
secondary structure and solvent accessibility information from a
PDB structure. This module can process that output.
"""

import os
import re
import pandas
import Bio.PDB


#: Maximum accessible surface area (in square angstroms) of each amino
#: acid, taken from
#: `Tien et al (2013) <https://doi.org/10.1371/journal.pone.0080635>`_.
#: Keys are one-letter amino-acid codes, so `MAX_ASA_TIEN[a]` is the
#: max surface area for amino acid `a`.
MAX_ASA_TIEN = {
    'A': 129.0,  # Ala
    'R': 274.0,  # Arg
    'N': 195.0,  # Asn
    'D': 193.0,  # Asp
    'C': 167.0,  # Cys
    'E': 223.0,  # Glu
    'Q': 225.0,  # Gln
    'G': 104.0,  # Gly
    'H': 224.0,  # His
    'I': 197.0,  # Ile
    'L': 201.0,  # Leu
    'K': 236.0,  # Lys
    'M': 224.0,  # Met
    'F': 240.0,  # Phe
    'P': 159.0,  # Pro
    'S': 155.0,  # Ser
    'T': 172.0,  # Thr
    'W': 285.0,  # Trp
    'Y': 263.0,  # Tyr
    'V': 174.0,  # Val
}


def processDSSP(dsspfile, chain=None, max_asa=MAX_ASA_TIEN):
    """Get secondary structure and solvent accessibility from ``dssp``.

    `dssp <http://swift.cmbi.ru.nl/gv/dssp/>`_ is a program
    that calculates secondary structure and absolute solvent
    accessibility from a PDB file.

    This function processes the text output provided by the
    `dssp webserver <http://swift.cmbi.ru.nl/gv/dssp/>`_, at
    least given the format of that output as of Sept-4-2017.
    The file is parsed with `Bio.PDB.make_dssp_dict`.

    It returns a `pandas.DataFrame` that gives the secondary
    structure and solvent accessibility for each residue in the
    ``dssp`` output.

    Args:
        `dsspfile` (str)
            Name of text file containing ``dssp`` output.

        `chain` (str or `None`)
            If the PDB file analyzed by ``dssp`` to create `dsspfile`
            has more than one chain, specify the letter code for one
            of those chains with this argument. If `None`, `dsspfile`
            must describe exactly one chain, which is then used.

        `max_asa` (dict)
            Max surface area for each amino acid in square angstroms,
            keyed by one-letter amino-acid code.

    Returns:
        A `pandas.DataFrame` with one row per residue of the selected
        chain and the following columns:

            - `site`: residue number for all sites in `dsspfile`.
              This is the integer residue number, or a string such
              as ``'52A'`` when the residue has an insertion code.

            - `amino_acid`: amino acid identity of site in `dsspfile`.
              Lowercase letters (used by ``dssp`` for cysteines in
              disulfide bridges) are reported as *C*.

            - `ASA`: absolute solvent accessibility of the residue.

            - `RSA`: relative solvent accessibility of the residue,
              computed as `ASA` divided by `max_asa` for the amino acid.

            - `SS`: ``dssp`` secondary structure code, one of:

                - *G*: 3-10 helix

                - *H*: alpha helix

                - *I*: pi helix

                - *B*: isolated beta bridge

                - *E*: extended strand (in a beta ladder)

                - *T*: turn

                - *S*: bend (high curvature)

                - *-*: loop

            - `SS_class`: broader secondary structure class:

                - *helix*: `SS` value of *G*, *H*, or *I*

                - *strand*: `SS` value of *B* or *E*

                - *loop*: `SS` value of *T*, *S*, or *-*

    Raises:
        AssertionError
            `chain` is `None` but `dsspfile` does not describe exactly
            one chain.

        ValueError
            `chain` is not one of the chains in `dsspfile`, or a
            residue has an `SS` code not listed above.

        KeyError
            A residue's amino acid is not a key of `max_asa`.
    """
    # Broad class for every secondary structure code we know how to handle.
    ss_classes = {
        'G': 'helix', 'H': 'helix', 'I': 'helix',
        'B': 'strand', 'E': 'strand',
        'T': 'loop', 'S': 'loop', '-': 'loop',
    }
    # dssp writes disulfide-bridged cysteines as lowercase letters.
    dssp_cys = re.compile('[a-z]')

    # Keys are (chain id, residue id); residue id is indexed below as
    # r[1] (residue number) and r[2] (insertion code).
    d_dssp = Bio.PDB.make_dssp_dict(dsspfile)[0]
    chains = {chainid for (chainid, _) in d_dssp}

    if chain is None:
        # Raised explicitly (rather than via an `assert` statement) so the
        # check is not stripped under `python -O`; the exception type is
        # kept as AssertionError for backward compatibility.
        if len(chains) != 1:
            raise AssertionError("chain is None, but multiple chains")
        chain = next(iter(chains))
    elif chain not in chains:
        raise ValueError("Invalid chain {0}".format(chain))

    d_df = {'site': [],
            'amino_acid': [],
            'ASA': [],
            'RSA': [],
            'SS': [],
            'SS_class': [],
            }
    for ((chainid, r), tup) in d_dssp.items():
        if chainid != chain:
            continue
        (tmp_aa, ss, acc) = tup[:3]
        aa = 'C' if dssp_cys.match(tmp_aa) else tmp_aa

        if ss not in ss_classes:
            raise ValueError("invalid SS of {0}".format(ss))

        # Sites with an insertion code become strings like '52A';
        # all other sites keep their integer residue number.
        icode = (r[2] or '').strip()
        site = str(r[1]) + icode if icode else r[1]

        # Compute RSA before appending anything so that a failure (e.g.
        # amino acid missing from `max_asa`) cannot leave ragged columns.
        rsa = acc / float(max_asa[aa])

        d_df['site'].append(site)
        d_df['amino_acid'].append(aa)
        d_df['ASA'].append(acc)
        d_df['RSA'].append(rsa)
        d_df['SS'].append(ss)
        d_df['SS_class'].append(ss_classes[ss])
    return pandas.DataFrame(d_df)



# When this file is executed as a script, run the doctests found in
# this module's docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
# Source code for dms_tools2.dssp
#
# dms_tools2
#
# Navigation
#
# Related Topics