Source code for dms_variants.pdb_utils

"""
===========
pdb_utils
===========

Functions to manipulate `PDB <https://www.rcsb.org/>`_ files.

"""

import collections  # noqa: F401
import os  # noqa: F401
import tempfile  # noqa: F401
import warnings

import Bio.PDB

import pandas as pd  # noqa: F401

import requests  # noqa: F401



[docs]
def reassign_b_factor(
    input_pdbfile,
    output_pdbfile,
    df,
    metric_col,
    *,
    site_col="site",
    chain_col="chain",
    missing_metric=0,
    model_index=0,
):
    r"""Reassign B factors in PDB file to some other metric.

    B-factor re-assignment is useful because PDB images can be colored
    by B factor using programs such as ``pymol`` using commands like::

        show surface, RBD; spectrum b, white red, RBD, minimum=0, maximum=1

    Parameters
    ----------
    input_pdbfile : str
        Path to input PDB file.
    output_pdbfile: str
        Name of created output PDB file with re-assigned B factors.
    df : pandas.DataFrame
        Data frame with metric used to re-assign B factor.
    metric_col : str
        Name of column in `df` that has the numerical metric that the B
        factor is re-assigned to.
    site_col : str
        Name of column in `df` with site numbers, which should map numbers
        used in PDB.
    chain_col : str
        Name of column in `df` with chain labels.
    missing_metric : float or dict
        How do we handl sites that are missing in `df`? If a float, reassign
        B factors for all missing sites to this value. If a dict, should be
        keyed by chain and assign all missing sites in each chain to
        indicated value.
    model_index : int
        Which model in the PDB to use. If a X-ray structure, there is
        probably just one model so you can use default of 0.

    Returns
    -------
    None

    Example
    -------
    Create data frame `df` that assigns metric to two sites in chain E:

    >>> df = pd.DataFrame({'chain': ['E', 'E'],
    ...                    'site': [333, 334],
    ...                    'metric': [0.5, 1.2]})

    Create dict `missing_metric` that assigns -1 to sites with missing
    metrics in chain E, and 0 to sites in other chains:

    >>> missing_metric = collections.defaultdict(lambda: 0)
    >>> missing_metric['E'] = -1

    Download PDB, do the re-assignment of B factors, read the lines
    from the resulting re-assigned PDB:

    >>> pdb_url = 'https://files.rcsb.org/download/6M0J.pdb'
    >>> r = requests.get(pdb_url)
    >>> with tempfile.TemporaryDirectory() as tmpdir:
    ...    original_pdbfile = os.path.join(tmpdir, 'original.pdb')
    ...    with open(original_pdbfile, 'wb') as f:
    ...        _ = f.write(r.content)
    ...    reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb')
    ...    reassign_b_factor(input_pdbfile=original_pdbfile,
    ...                      output_pdbfile=reassigned_pdbfile,
    ...                      df=df,
    ...                      metric_col='metric',
    ...                      missing_metric=missing_metric)
    ...    pdb_text = open(reassigned_pdbfile).readlines()

    Now spot check some key lines in the output PDB.
    Chain A has all sites with B factors (last entry) re-assigned to 0:

    >>> print(pdb_text[0].strip())
    ATOM      1  N   SER A  19     -31.455  49.474   2.505  1.00  0.00           N

    Chain E has sites 333 and 334 with B-factors assigned to values in `df`, and
    other sites (such as 335) assigned to -1:

    >>> print('\n'.join(line.strip() for line in pdb_text[5010: 5025]))
    ATOM   5010  O   THR E 333     -34.954  13.568  46.370  1.00  0.50           O
    ATOM   5011  CB  THR E 333     -33.695  14.409  48.627  1.00  0.50           C
    ATOM   5012  OG1 THR E 333     -34.797  14.149  49.507  1.00  0.50           O
    ATOM   5013  CG2 THR E 333     -32.495  14.879  49.438  1.00  0.50           C
    ATOM   5014  N   ASN E 334     -35.532  15.604  45.605  1.00  1.20           N
    ATOM   5015  CA  ASN E 334     -36.287  15.087  44.474  1.00  1.20           C
    ATOM   5016  C   ASN E 334     -35.475  15.204  43.182  1.00  1.20           C
    ATOM   5017  O   ASN E 334     -34.533  15.994  43.076  1.00  1.20           O
    ATOM   5018  CB  ASN E 334     -37.622  15.823  44.337  1.00  1.20           C
    ATOM   5019  CG  ASN E 334     -38.660  15.006  43.586  1.00  1.20           C
    ATOM   5020  OD1 ASN E 334     -38.568  13.776  43.514  1.00  1.20           O
    ATOM   5021  ND2 ASN E 334     -39.649  15.686  43.016  1.00  1.20           N
    ATOM   5022  N   LEU E 335     -35.849  14.391  42.194  1.00 -1.00           N
    ATOM   5023  CA  LEU E 335     -35.084  14.305  40.955  1.00 -1.00           C
    ATOM   5024  C   LEU E 335     -35.466  15.426  39.992  1.00 -1.00           C

    """  # noqa: E501
    # subset `df` to needed columns and error check it
    cols = [metric_col, site_col, chain_col]
    for col in cols:
        if col not in df.columns:
            raise ValueError(f"`df` lacks column {col}")
    df = df[cols].drop_duplicates()
    if len(df) != len(df.groupby([site_col, chain_col])):
        raise ValueError("non-unique metric for a site in a chain")

    if df[site_col].dtype != int:
        raise ValueError("function currently requires `site_col` to be int")

    # read PDB, catch warnings about discontinuous chains
    with warnings.catch_warnings():
        warnings.simplefilter(
            "ignore", category=Bio.PDB.PDBExceptions.PDBConstructionWarning
        )
        pdb = Bio.PDB.PDBParser().get_structure("_", input_pdbfile)

    # get the model out of the PDB
    model = list(pdb.get_models())[model_index]

    # make sure all chains in PDB
    missing_chains = set(df[chain_col]) - {chain.id for chain in model.get_chains()}
    if missing_chains:
        raise ValueError(f"`df` has chains not in PDB: {missing_chains}")

    # make missing_metric a dict if it isn't already
    if not isinstance(missing_metric, dict):
        missing_metric = {chain.id: missing_metric for chain in model.get_chains()}

    # loop over all chains and do coloring
    for chain in model.get_chains():
        chain_id = chain.id
        site_to_val = (
            df.query(f"{chain_col} == @chain_id")
            .set_index(site_col)[metric_col]
            .to_dict()
        )
        for residue in chain:
            site = residue.get_id()[1]
            try:
                metric_val = site_to_val[site]
            except KeyError:
                metric_val = missing_metric[chain_id]
            # for disordered residues, get list of them
            try:
                residuelist = residue.disordered_get_list()
            except AttributeError:
                residuelist = [residue]
            for r in residuelist:
                for atom in r:
                    # for disordered atoms, get list of them
                    try:
                        atomlist = atom.disordered_get_list()
                    except AttributeError:
                        atomlist = [atom]
                    for a in atomlist:
                        a.bfactor = metric_val

    # write PDB
    io = Bio.PDB.PDBIO()
    io.set_structure(pdb)
    io.save(output_pdbfile)



if __name__ == "__main__":
    import doctest

    doctest.testmod()
Source code for dms_variants.pdb_utils

dms_variants

Navigation

Related Topics