In [1]:
import warnings
import pandas as pd
import numpy as np
import altair as alt
from Bio.PDB import PDBParser
from scipy.stats import mannwhitneyu
from natsort import natsort_keygen
import theme
warnings.filterwarnings('ignore')
alt.data_transformers.disable_max_rows()
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
sitemap = pd.read_csv('../data/h3_site_numbering_map.csv')
struct_align = pd.read_csv('../results/structural_alignment/structural_alignment_detailed.csv')
js_df_h3_h5 = pd.read_csv('../results/divergence/h3_h5_divergence.csv')
js_df_h3_h7 = pd.read_csv('../results/divergence/h3_h7_divergence.csv')
js_df_h5_h7 = pd.read_csv('../results/divergence/h5_h7_divergence.csv')
Build contact map from 4o5n (H3) PDB¶
Contacts are defined as pairs of residues with Cβ–Cβ distance ≤ 8 Å. The H3 structure is used as the reference since all sites are numbered relative to H3. Both HA1 (chain A) and HA2 (chain B) are included to allow cross-domain contacts.
In [3]:
CONTACT_CUTOFF = 8.0
# Build mapping of struct_site -> (chain, pdb_residue_number) in 4o5n
site_to_pdb = {}
pdb_to_site = {}
for _, row in struct_align.iterrows():
if (
pd.notna(row['4o5n_aa_pdb_site'])
and pd.notna(row['4o5n_aa_chain'])
and row['4o5n_aa'] not in ['-', None, float('nan')]
):
chain = str(row['4o5n_aa_chain'])
pdb_res = int(row['4o5n_aa_pdb_site'])
site = str(row['struct_site'])
site_to_pdb[site] = (chain, pdb_res)
pdb_to_site[(chain, pdb_res)] = site
print(f"Sites with 4o5n PDB coordinates: {len(site_to_pdb)}")
# Load Cβ positions
parser = PDBParser(QUIET=True)
structure = parser.get_structure('4o5n', '../data/pdbs/4o5n.pdb')
model = list(structure)[0]
positions = {} # (chain, resnum) -> np.array
for chain_id in ['A', 'B']:
for residue in model[chain_id]:
if residue.id[0] != ' ': # skip HETATM
continue
atom = 'CB' if 'CB' in residue else ('CA' if 'CA' in residue else None)
if atom:
positions[(chain_id, residue.id[1])] = residue[atom].get_vector().get_array()
# Only keep positions for sites in our dataset
data_positions = {k: v for k, v in positions.items() if k in pdb_to_site}
print(f"Residues with positions in dataset: {len(data_positions)}")
# Compute all pairwise contacts
items = list(data_positions.items())
contacts = {} # (chain, res) -> set of (chain, res)
for i in range(len(items)):
k1, p1 = items[i]
for j in range(i + 1, len(items)):
k2, p2 = items[j]
if np.linalg.norm(p1 - p2) <= CONTACT_CUTOFF:
contacts.setdefault(k1, set()).add(k2)
contacts.setdefault(k2, set()).add(k1)
n_contacts = [len(v) for v in contacts.values()]
print(f"Sites with ≥1 contact: {len(contacts)}")
print(f"Avg contacts per site: {np.mean(n_contacts):.1f} (median: {np.median(n_contacts):.0f})")
Sites with 4o5n PDB coordinates: 490
Residues with positions in dataset: 490
Sites with ≥1 contact: 490 Avg contacts per site: 10.0 (median: 10)
Compute fraction of changed neighbors per site¶
In [4]:
def compute_neighborhood_changes(js_df, ha_x, ha_y, rsa_col='4o5n_aa_RSA', buried_cutoff=0.2):
"""
For each site in js_df, compute the fraction of structural neighbors
(within CONTACT_CUTOFF Å in 4o5n) whose wildtype residue changed
between ha_x and ha_y.
Returns an annotated copy of js_df with additional columns:
n_neighbors : number of neighbors with data in both HAs
n_changed_neighbors : number of those with a changed WT residue
frac_changed_neighbors: fraction of neighbors with changed WT
same_wildtype : whether this site's own WT residue changed
surface_exposed : 'Buried' / 'Exposed' based on 4o5n RSA
"""
# Determine which sites have changed WT between the two subtypes
wt_changed = {}
for _, row in js_df.iterrows():
site = str(row['struct_site'])
wx, wy = row[f'{ha_x}_wt_aa'], row[f'{ha_y}_wt_aa']
if pd.notna(wx) and pd.notna(wy):
wt_changed[site] = (wx != wy)
results = []
for _, row in js_df.iterrows():
site = str(row['struct_site'])
pdb_key = site_to_pdb.get(site)
if pdb_key is None:
continue
neighbor_keys = contacts.get(pdb_key, set())
neighbor_sites = [
pdb_to_site[k]
for k in neighbor_keys
if k in pdb_to_site and pdb_to_site[k] in wt_changed
]
n = len(neighbor_sites)
n_changed = sum(1 for s in neighbor_sites if wt_changed[s])
results.append({
'struct_site': site,
'n_neighbors': n,
'n_changed_neighbors': n_changed,
'frac_changed_neighbors': n_changed / n if n > 0 else np.nan,
})
result_df = pd.DataFrame(results)
merged = js_df.merge(result_df, on='struct_site').assign(
JSD=lambda x: x[f'JS_{ha_x}_vs_{ha_y}'],
same_wildtype=lambda x: np.where(
x[f'{ha_x}_wt_aa'] == x[f'{ha_y}_wt_aa'],
'Amino acid conserved',
'Amino acid changed',
),
surface_exposed=lambda x: np.where(
x[rsa_col] > buried_cutoff, 'Exposed', 'Buried'
),
)
return merged
nbhd_h3_h5 = compute_neighborhood_changes(js_df_h3_h5, 'h3', 'h5')
nbhd_h3_h7 = compute_neighborhood_changes(js_df_h3_h7, 'h3', 'h7')
nbhd_h5_h7 = compute_neighborhood_changes(js_df_h5_h7, 'h5', 'h7')
nbhd_h3_h5.head(3)
Out[4]:
| struct_site | h3_wt_aa | h5_wt_aa | rmsd_h3h5 | 4o5n_aa_RSA | JS_h3_vs_h5 | n_neighbors | n_changed_neighbors | frac_changed_neighbors | JSD | same_wildtype | surface_exposed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | S | K | 9.167400 | 1.084277 | 0.079776 | 4 | 3 | 0.750000 | 0.079776 | Amino acid changed | Exposed |
| 1 | 10 | T | S | 8.157247 | 0.150962 | 0.272808 | 12 | 7 | 0.583333 | 0.272808 | Amino acid changed | Buried |
| 2 | 11 | A | D | 5.040040 | 0.050388 | 0.347071 | 14 | 8 | 0.571429 | 0.347071 | Amino acid changed | Buried |
Divergence versus fraction of changed neighbors at buried sites¶
Among buried sites (RSA < 0.2), is divergence in amino-acid preferences associated with having more neighbors whose wildtype residue also changed?
In [5]:
def format_pvalue(p):
superscripts = str.maketrans("0123456789-", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻")
if p < 1e-5:
return "p < 1 × 10⁻⁵"
elif p < 1e-3:
base, exp = f"{p:.0e}".split("e")
exp_str = str(int(exp)).translate(superscripts)
return f"p = {base} × 10{exp_str}"
else:
return f"p = {p:.2g}"
def plot_jsd_vs_frac_changed(nbhd_df, ha_x, ha_y, colors, width=175, height=150):
df = (
nbhd_df
.query('surface_exposed == "Buried"')
.dropna(subset=['JSD', 'frac_changed_neighbors'])
.reset_index(drop=True)
)
r = df['JSD'].corr(df['frac_changed_neighbors'], method='spearman')
r_text = f"r = {r:.2f}"
chart = alt.Chart(df).mark_circle(
size=30, opacity=0.8, strokeWidth=0.5, stroke='black'
).encode(
x=alt.X(
'frac_changed_neighbors:Q',
title=['Fraction of neighbors', 'with changed WT residue'],
scale=alt.Scale(domain=[0, 1]),
),
y=alt.Y('JSD:Q', title=['Divergence in amino-acid', 'preferences']),
color=alt.Color(
'same_wildtype:N',
title=None,
scale=alt.Scale(
domain=list(colors.keys()),
range=list(colors.values()),
),
legend=alt.Legend(orient='top', columns=1),
),
tooltip=[
'struct_site',
f'{ha_x}_wt_aa',
f'{ha_y}_wt_aa',
'JSD',
'frac_changed_neighbors',
'n_neighbors',
'n_changed_neighbors',
],
).properties(
width=width,
height=height,
title=alt.Title(
f'{ha_x.upper()} vs. {ha_y.upper()}',
fontSize=16,
anchor='middle',
),
)
r_label = alt.Chart(pd.DataFrame({'text': [r_text]})).mark_text(
align='left', baseline='top', fontSize=14, color='black'
).encode(
text='text:N',
x=alt.value(5),
y=alt.value(5),
)
return chart + r_label
colors = {
'Amino acid changed': '#5484AF',
'Amino acid conserved': '#E04948',
}
(
plot_jsd_vs_frac_changed(nbhd_h3_h5, 'h3', 'h5', colors)
| plot_jsd_vs_frac_changed(nbhd_h3_h7, 'h3', 'h7', colors)
| plot_jsd_vs_frac_changed(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(color='shared')
Out[5]:
Do buried sites with changed WT have more changed neighbors?¶
Compare sites where the wildtype residue changed versus was conserved (restricted to buried sites, RSA < 0.2).
In [6]:
def plot_frac_changed_boxplot(nbhd_df, ha_x, ha_y, colors, box_size=30, width=175, height=150):
df = (
nbhd_df
.query('surface_exposed == "Buried"')
.dropna(subset=['frac_changed_neighbors'])
.reset_index(drop=True)
)
groups = df.groupby('same_wildtype')['frac_changed_neighbors'].apply(list)
# Mann-Whitney U: changed > conserved
stat, p = mannwhitneyu(
groups.get('Amino acid changed', []),
groups.get('Amino acid conserved', []),
alternative='greater',
)
p_text = format_pvalue(p)
label_expr = (
"datum.label == 'Amino acid changed' ? ['Amino acid', 'changed'] "
": datum.label == 'Amino acid conserved' ? ['Amino acid', 'conserved'] "
": datum.label"
)
boxplot = alt.Chart(df).mark_boxplot(
size=box_size, outliers=False
).encode(
x=alt.X(
'same_wildtype:N',
title=None,
axis=alt.Axis(labelAngle=0, labelExpr=label_expr),
),
y=alt.Y(
'frac_changed_neighbors:Q',
title=['Fraction of neighbors', 'with changed WT residue'],
),
color=alt.Color(
'same_wildtype:N',
legend=None,
scale=alt.Scale(
domain=list(colors.keys()),
range=list(colors.values()),
),
),
).properties(
width=width,
height=height,
title=alt.Title(
f'{ha_x.upper()} vs. {ha_y.upper()}',
fontSize=16,
anchor='middle',
),
)
p_label = alt.Chart(pd.DataFrame({'text': [p_text]})).mark_text(
align='center', baseline='top', fontSize=14, color='black'
).encode(
text='text:N',
y=alt.value(-15),
)
return boxplot + p_label
colors = {
'Amino acid changed': '#5484AF',
'Amino acid conserved': '#E04948',
}
(
plot_frac_changed_boxplot(nbhd_h3_h5, 'h3', 'h5', colors)
| plot_frac_changed_boxplot(nbhd_h3_h7, 'h3', 'h7', colors)
| plot_frac_changed_boxplot(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(y='shared')
Out[6]: