Source code for dms_variants.pdb_utils

"""
===========
pdb_utils
===========

Functions to manipulate `PDB <https://www.rcsb.org/>`_ files.

"""

import collections  # noqa: F401
import os  # noqa: F401
import tempfile  # noqa: F401
import warnings

import Bio.PDB

import pandas as pd  # noqa: F401

import requests  # noqa: F401


[docs] def reassign_b_factor( input_pdbfile, output_pdbfile, df, metric_col, *, site_col="site", chain_col="chain", missing_metric=0, model_index=0, ): r"""Reassign B factors in PDB file to some other metric. B-factor re-assignment is useful because PDB images can be colored by B factor using programs such as ``pymol`` using commands like:: show surface, RBD; spectrum b, white red, RBD, minimum=0, maximum=1 Parameters ---------- input_pdbfile : str Path to input PDB file. output_pdbfile: str Name of created output PDB file with re-assigned B factors. df : pandas.DataFrame Data frame with metric used to re-assign B factor. metric_col : str Name of column in `df` that has the numerical metric that the B factor is re-assigned to. site_col : str Name of column in `df` with site numbers, which should map numbers used in PDB. chain_col : str Name of column in `df` with chain labels. missing_metric : float or dict How do we handl sites that are missing in `df`? If a float, reassign B factors for all missing sites to this value. If a dict, should be keyed by chain and assign all missing sites in each chain to indicated value. model_index : int Which model in the PDB to use. If a X-ray structure, there is probably just one model so you can use default of 0. Returns ------- None Example ------- Create data frame `df` that assigns metric to two sites in chain E: >>> df = pd.DataFrame({'chain': ['E', 'E'], ... 'site': [333, 334], ... 'metric': [0.5, 1.2]}) Create dict `missing_metric` that assigns -1 to sites with missing metrics in chain E, and 0 to sites in other chains: >>> missing_metric = collections.defaultdict(lambda: 0) >>> missing_metric['E'] = -1 Download PDB, do the re-assignment of B factors, read the lines from the resulting re-assigned PDB: >>> pdb_url = 'https://files.rcsb.org/download/6M0J.pdb' >>> r = requests.get(pdb_url) >>> with tempfile.TemporaryDirectory() as tmpdir: ... original_pdbfile = os.path.join(tmpdir, 'original.pdb') ... with open(original_pdbfile, 'wb') as f: ... _ = f.write(r.content) ... reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb') ... reassign_b_factor(input_pdbfile=original_pdbfile, ... output_pdbfile=reassigned_pdbfile, ... df=df, ... metric_col='metric', ... missing_metric=missing_metric) ... pdb_text = open(reassigned_pdbfile).readlines() Now spot check some key lines in the output PDB. Chain A has all sites with B factors (last entry) re-assigned to 0: >>> print(pdb_text[0].strip()) ATOM 1 N SER A 19 -31.455 49.474 2.505 1.00 0.00 N Chain E has sites 333 and 334 with B-factors assigned to values in `df`, and other sites (such as 335) assigned to -1: >>> print('\n'.join(line.strip() for line in pdb_text[5010: 5025])) ATOM 5010 O THR E 333 -34.954 13.568 46.370 1.00 0.50 O ATOM 5011 CB THR E 333 -33.695 14.409 48.627 1.00 0.50 C ATOM 5012 OG1 THR E 333 -34.797 14.149 49.507 1.00 0.50 O ATOM 5013 CG2 THR E 333 -32.495 14.879 49.438 1.00 0.50 C ATOM 5014 N ASN E 334 -35.532 15.604 45.605 1.00 1.20 N ATOM 5015 CA ASN E 334 -36.287 15.087 44.474 1.00 1.20 C ATOM 5016 C ASN E 334 -35.475 15.204 43.182 1.00 1.20 C ATOM 5017 O ASN E 334 -34.533 15.994 43.076 1.00 1.20 O ATOM 5018 CB ASN E 334 -37.622 15.823 44.337 1.00 1.20 C ATOM 5019 CG ASN E 334 -38.660 15.006 43.586 1.00 1.20 C ATOM 5020 OD1 ASN E 334 -38.568 13.776 43.514 1.00 1.20 O ATOM 5021 ND2 ASN E 334 -39.649 15.686 43.016 1.00 1.20 N ATOM 5022 N LEU E 335 -35.849 14.391 42.194 1.00 -1.00 N ATOM 5023 CA LEU E 335 -35.084 14.305 40.955 1.00 -1.00 C ATOM 5024 C LEU E 335 -35.466 15.426 39.992 1.00 -1.00 C """ # noqa: E501 # subset `df` to needed columns and error check it cols = [metric_col, site_col, chain_col] for col in cols: if col not in df.columns: raise ValueError(f"`df` lacks column {col}") df = df[cols].drop_duplicates() if len(df) != len(df.groupby([site_col, chain_col])): raise ValueError("non-unique metric for a site in a chain") if df[site_col].dtype != int: raise ValueError("function currently requires `site_col` to be int") # read PDB, catch warnings about discontinuous chains with warnings.catch_warnings(): warnings.simplefilter( "ignore", category=Bio.PDB.PDBExceptions.PDBConstructionWarning ) pdb = Bio.PDB.PDBParser().get_structure("_", input_pdbfile) # get the model out of the PDB model = list(pdb.get_models())[model_index] # make sure all chains in PDB missing_chains = set(df[chain_col]) - {chain.id for chain in model.get_chains()} if missing_chains: raise ValueError(f"`df` has chains not in PDB: {missing_chains}") # make missing_metric a dict if it isn't already if not isinstance(missing_metric, dict): missing_metric = {chain.id: missing_metric for chain in model.get_chains()} # loop over all chains and do coloring for chain in model.get_chains(): chain_id = chain.id site_to_val = ( df.query(f"{chain_col} == @chain_id") .set_index(site_col)[metric_col] .to_dict() ) for residue in chain: site = residue.get_id()[1] try: metric_val = site_to_val[site] except KeyError: metric_val = missing_metric[chain_id] # for disordered residues, get list of them try: residuelist = residue.disordered_get_list() except AttributeError: residuelist = [residue] for r in residuelist: for atom in r: # for disordered atoms, get list of them try: atomlist = atom.disordered_get_list() except AttributeError: atomlist = [atom] for a in atomlist: a.bfactor = metric_val # write PDB io = Bio.PDB.PDBIO() io.set_structure(pdb) io.save(output_pdbfile)
if __name__ == "__main__": import doctest doctest.testmod()