In [1]:
import warnings
import pandas as pd
import numpy as np
import altair as alt
from Bio.PDB import PDBParser
from scipy.stats import mannwhitneyu
from natsort import natsort_keygen

import theme

warnings.filterwarnings('ignore')
alt.data_transformers.disable_max_rows()
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
sitemap = pd.read_csv('../data/h3_site_numbering_map.csv')
struct_align = pd.read_csv('../results/structural_alignment/structural_alignment_detailed.csv')

js_df_h3_h5 = pd.read_csv('../results/divergence/h3_h5_divergence.csv')
js_df_h3_h7 = pd.read_csv('../results/divergence/h3_h7_divergence.csv')
js_df_h5_h7 = pd.read_csv('../results/divergence/h5_h7_divergence.csv')

Build contact map from 4o5n (H3) PDB¶

Contacts are defined as pairs of residues with Cβ–Cβ distance ≤ 8 Å. The H3 structure is used as the reference since all sites are numbered relative to H3. Both HA1 (chain A) and HA2 (chain B) are included to allow cross-domain contacts.

In [3]:
CONTACT_CUTOFF = 8.0

# Build mapping of struct_site -> (chain, pdb_residue_number) in 4o5n
site_to_pdb = {}
pdb_to_site = {}
for _, row in struct_align.iterrows():
    if (
        pd.notna(row['4o5n_aa_pdb_site'])
        and pd.notna(row['4o5n_aa_chain'])
        and row['4o5n_aa'] not in ['-', None, float('nan')]
    ):
        chain = str(row['4o5n_aa_chain'])
        pdb_res = int(row['4o5n_aa_pdb_site'])
        site = str(row['struct_site'])
        site_to_pdb[site] = (chain, pdb_res)
        pdb_to_site[(chain, pdb_res)] = site

print(f"Sites with 4o5n PDB coordinates: {len(site_to_pdb)}")

# Load Cβ positions
parser = PDBParser(QUIET=True)
structure = parser.get_structure('4o5n', '../data/pdbs/4o5n.pdb')
model = list(structure)[0]

positions = {}  # (chain, resnum) -> np.array
for chain_id in ['A', 'B']:
    for residue in model[chain_id]:
        if residue.id[0] != ' ':  # skip HETATM
            continue
        atom = 'CB' if 'CB' in residue else ('CA' if 'CA' in residue else None)
        if atom:
            positions[(chain_id, residue.id[1])] = residue[atom].get_vector().get_array()

# Only keep positions for sites in our dataset
data_positions = {k: v for k, v in positions.items() if k in pdb_to_site}
print(f"Residues with positions in dataset: {len(data_positions)}")

# Compute all pairwise contacts
items = list(data_positions.items())
contacts = {}  # (chain, res) -> set of (chain, res)
for i in range(len(items)):
    k1, p1 = items[i]
    for j in range(i + 1, len(items)):
        k2, p2 = items[j]
        if np.linalg.norm(p1 - p2) <= CONTACT_CUTOFF:
            contacts.setdefault(k1, set()).add(k2)
            contacts.setdefault(k2, set()).add(k1)

n_contacts = [len(v) for v in contacts.values()]
print(f"Sites with ≥1 contact: {len(contacts)}")
print(f"Avg contacts per site: {np.mean(n_contacts):.1f} (median: {np.median(n_contacts):.0f})")
Sites with 4o5n PDB coordinates: 490
Residues with positions in dataset: 490
Sites with ≥1 contact: 490
Avg contacts per site: 10.0 (median: 10)

Compute fraction of changed neighbors per site¶

In [4]:
def compute_neighborhood_changes(js_df, ha_x, ha_y, rsa_col='4o5n_aa_RSA', buried_cutoff=0.2):
    """
    For each site in js_df, compute the fraction of structural neighbors
    (within CONTACT_CUTOFF Å in 4o5n) whose wildtype residue changed
    between ha_x and ha_y.

    Returns an annotated copy of js_df with additional columns:
      n_neighbors          : number of neighbors with data in both HAs
      n_changed_neighbors  : number of those with a changed WT residue
      frac_changed_neighbors: fraction of neighbors with changed WT
      same_wildtype        : whether this site's own WT residue changed
      surface_exposed      : 'Buried' / 'Exposed' based on 4o5n RSA
    """
    # Determine which sites have changed WT between the two subtypes
    wt_changed = {}
    for _, row in js_df.iterrows():
        site = str(row['struct_site'])
        wx, wy = row[f'{ha_x}_wt_aa'], row[f'{ha_y}_wt_aa']
        if pd.notna(wx) and pd.notna(wy):
            wt_changed[site] = (wx != wy)

    results = []
    for _, row in js_df.iterrows():
        site = str(row['struct_site'])
        pdb_key = site_to_pdb.get(site)
        if pdb_key is None:
            continue
        neighbor_keys = contacts.get(pdb_key, set())
        neighbor_sites = [
            pdb_to_site[k]
            for k in neighbor_keys
            if k in pdb_to_site and pdb_to_site[k] in wt_changed
        ]
        n = len(neighbor_sites)
        n_changed = sum(1 for s in neighbor_sites if wt_changed[s])
        results.append({
            'struct_site': site,
            'n_neighbors': n,
            'n_changed_neighbors': n_changed,
            'frac_changed_neighbors': n_changed / n if n > 0 else np.nan,
        })

    result_df = pd.DataFrame(results)
    merged = js_df.merge(result_df, on='struct_site').assign(
        JSD=lambda x: x[f'JS_{ha_x}_vs_{ha_y}'],
        same_wildtype=lambda x: np.where(
            x[f'{ha_x}_wt_aa'] == x[f'{ha_y}_wt_aa'],
            'Amino acid conserved',
            'Amino acid changed',
        ),
        surface_exposed=lambda x: np.where(
            x[rsa_col] > buried_cutoff, 'Exposed', 'Buried'
        ),
    )
    return merged


nbhd_h3_h5 = compute_neighborhood_changes(js_df_h3_h5, 'h3', 'h5')
nbhd_h3_h7 = compute_neighborhood_changes(js_df_h3_h7, 'h3', 'h7')
nbhd_h5_h7 = compute_neighborhood_changes(js_df_h5_h7, 'h5', 'h7')

nbhd_h3_h5.head(3)
Out[4]:
struct_site h3_wt_aa h5_wt_aa rmsd_h3h5 4o5n_aa_RSA JS_h3_vs_h5 n_neighbors n_changed_neighbors frac_changed_neighbors JSD same_wildtype surface_exposed
0 9 S K 9.167400 1.084277 0.079776 4 3 0.750000 0.079776 Amino acid changed Exposed
1 10 T S 8.157247 0.150962 0.272808 12 7 0.583333 0.272808 Amino acid changed Buried
2 11 A D 5.040040 0.050388 0.347071 14 8 0.571429 0.347071 Amino acid changed Buried

Divergence versus fraction of changed neighbors at buried sites¶

Among buried sites (RSA < 0.2), is divergence in amino-acid preferences associated with having more neighbors whose wildtype residue also changed?

In [5]:
def format_pvalue(p):
    superscripts = str.maketrans("0123456789-", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻")
    if p < 1e-5:
        return "p < 1 × 10⁻⁵"
    elif p < 1e-3:
        base, exp = f"{p:.0e}".split("e")
        exp_str = str(int(exp)).translate(superscripts)
        return f"p = {base} × 10{exp_str}"
    else:
        return f"p = {p:.2g}"


def plot_jsd_vs_frac_changed(nbhd_df, ha_x, ha_y, colors, width=175, height=150):
    df = (
        nbhd_df
        .query('surface_exposed == "Buried"')
        .dropna(subset=['JSD', 'frac_changed_neighbors'])
        .reset_index(drop=True)
    )

    r = df['JSD'].corr(df['frac_changed_neighbors'], method='spearman')
    r_text = f"r = {r:.2f}"

    chart = alt.Chart(df).mark_circle(
        size=30, opacity=0.8, strokeWidth=0.5, stroke='black'
    ).encode(
        x=alt.X(
            'frac_changed_neighbors:Q',
            title=['Fraction of neighbors', 'with changed WT residue'],
            scale=alt.Scale(domain=[0, 1]),
        ),
        y=alt.Y('JSD:Q', title=['Divergence in amino-acid', 'preferences']),
        color=alt.Color(
            'same_wildtype:N',
            title=None,
            scale=alt.Scale(
                domain=list(colors.keys()),
                range=list(colors.values()),
            ),
            legend=alt.Legend(orient='top', columns=1),
        ),
        tooltip=[
            'struct_site',
            f'{ha_x}_wt_aa',
            f'{ha_y}_wt_aa',
            'JSD',
            'frac_changed_neighbors',
            'n_neighbors',
            'n_changed_neighbors',
        ],
    ).properties(
        width=width,
        height=height,
        title=alt.Title(
            f'{ha_x.upper()} vs. {ha_y.upper()}',
            fontSize=16,
            anchor='middle',
        ),
    )

    r_label = alt.Chart(pd.DataFrame({'text': [r_text]})).mark_text(
        align='left', baseline='top', fontSize=14, color='black'
    ).encode(
        text='text:N',
        x=alt.value(5),
        y=alt.value(5),
    )

    return chart + r_label


colors = {
    'Amino acid changed': '#5484AF',
    'Amino acid conserved': '#E04948',
}

(
    plot_jsd_vs_frac_changed(nbhd_h3_h5, 'h3', 'h5', colors)
    | plot_jsd_vs_frac_changed(nbhd_h3_h7, 'h3', 'h7', colors)
    | plot_jsd_vs_frac_changed(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(color='shared')
Out[5]:

Do buried sites with changed WT have more changed neighbors?¶

Compare sites where the wildtype residue changed versus was conserved (restricted to buried sites, RSA < 0.2).

In [6]:
def plot_frac_changed_boxplot(nbhd_df, ha_x, ha_y, colors, box_size=30, width=175, height=150):
    df = (
        nbhd_df
        .query('surface_exposed == "Buried"')
        .dropna(subset=['frac_changed_neighbors'])
        .reset_index(drop=True)
    )

    groups = df.groupby('same_wildtype')['frac_changed_neighbors'].apply(list)
    # Mann-Whitney U: changed > conserved
    stat, p = mannwhitneyu(
        groups.get('Amino acid changed', []),
        groups.get('Amino acid conserved', []),
        alternative='greater',
    )
    p_text = format_pvalue(p)

    label_expr = (
        "datum.label == 'Amino acid changed' ? ['Amino acid', 'changed'] "
        ": datum.label == 'Amino acid conserved' ? ['Amino acid', 'conserved'] "
        ": datum.label"
    )

    boxplot = alt.Chart(df).mark_boxplot(
        size=box_size, outliers=False
    ).encode(
        x=alt.X(
            'same_wildtype:N',
            title=None,
            axis=alt.Axis(labelAngle=0, labelExpr=label_expr),
        ),
        y=alt.Y(
            'frac_changed_neighbors:Q',
            title=['Fraction of neighbors', 'with changed WT residue'],
        ),
        color=alt.Color(
            'same_wildtype:N',
            legend=None,
            scale=alt.Scale(
                domain=list(colors.keys()),
                range=list(colors.values()),
            ),
        ),
    ).properties(
        width=width,
        height=height,
        title=alt.Title(
            f'{ha_x.upper()} vs. {ha_y.upper()}',
            fontSize=16,
            anchor='middle',
        ),
    )

    p_label = alt.Chart(pd.DataFrame({'text': [p_text]})).mark_text(
        align='center', baseline='top', fontSize=14, color='black'
    ).encode(
        text='text:N',
        y=alt.value(-15),
    )

    return boxplot + p_label


colors = {
    'Amino acid changed': '#5484AF',
    'Amino acid conserved': '#E04948',
}

(
    plot_frac_changed_boxplot(nbhd_h3_h5, 'h3', 'h5', colors)
    | plot_frac_changed_boxplot(nbhd_h3_h7, 'h3', 'h7', colors)
    | plot_frac_changed_boxplot(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(y='shared')
Out[6]: