import warnings
import pandas as pd
import numpy as np
import altair as alt
from Bio.PDB import PDBParser
from scipy.stats import mannwhitneyu
from natsort import natsort_keygen

import theme

warnings.filterwarnings('ignore')
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

sitemap = pd.read_csv('../data/h3_site_numbering_map.csv')
struct_align = pd.read_csv('../results/structural_alignment/structural_alignment_detailed.csv')

js_df_h3_h5 = pd.read_csv('../results/divergence/h3_h5_divergence.csv')
js_df_h3_h7 = pd.read_csv('../results/divergence/h3_h7_divergence.csv')
js_df_h5_h7 = pd.read_csv('../results/divergence/h5_h7_divergence.csv')

CONTACT_CUTOFF = 8.0

# Build mapping of struct_site -> (chain, pdb_residue_number) in 4o5n
site_to_pdb = {}
pdb_to_site = {}
for _, row in struct_align.iterrows():
    if (
        pd.notna(row['4o5n_aa_pdb_site'])
        and pd.notna(row['4o5n_aa_chain'])
        and row['4o5n_aa'] not in ['-', None, float('nan')]
    ):
        chain = str(row['4o5n_aa_chain'])
        pdb_res = int(row['4o5n_aa_pdb_site'])
        site = str(row['struct_site'])
        site_to_pdb[site] = (chain, pdb_res)
        pdb_to_site[(chain, pdb_res)] = site

print(f"Sites with 4o5n PDB coordinates: {len(site_to_pdb)}")

# Load Cβ positions
parser = PDBParser(QUIET=True)
structure = parser.get_structure('4o5n', '../data/pdbs/4o5n.pdb')
model = list(structure)[0]

positions = {}  # (chain, resnum) -> np.array
for chain_id in ['A', 'B']:
    for residue in model[chain_id]:
        if residue.id[0] != ' ':  # skip HETATM
            continue
        atom = 'CB' if 'CB' in residue else ('CA' if 'CA' in residue else None)
        if atom:
            positions[(chain_id, residue.id[1])] = residue[atom].get_vector().get_array()

# Only keep positions for sites in our dataset
data_positions = {k: v for k, v in positions.items() if k in pdb_to_site}
print(f"Residues with positions in dataset: {len(data_positions)}")

# Compute all pairwise contacts
items = list(data_positions.items())
contacts = {}  # (chain, res) -> set of (chain, res)
for i in range(len(items)):
    k1, p1 = items[i]
    for j in range(i + 1, len(items)):
        k2, p2 = items[j]
        if np.linalg.norm(p1 - p2) <= CONTACT_CUTOFF:
            contacts.setdefault(k1, set()).add(k2)
            contacts.setdefault(k2, set()).add(k1)

n_contacts = [len(v) for v in contacts.values()]
print(f"Sites with ≥1 contact: {len(contacts)}")
print(f"Avg contacts per site: {np.mean(n_contacts):.1f} (median: {np.median(n_contacts):.0f})")

Sites with 4o5n PDB coordinates: 490

Residues with positions in dataset: 490

Sites with ≥1 contact: 490
Avg contacts per site: 10.0 (median: 10)

def compute_neighborhood_changes(js_df, ha_x, ha_y, rsa_col='4o5n_aa_RSA', buried_cutoff=0.2):
    """
    For each site in js_df, compute the fraction of structural neighbors
    (within CONTACT_CUTOFF Å in 4o5n) whose wildtype residue changed
    between ha_x and ha_y.

    Returns an annotated copy of js_df with additional columns:
      n_neighbors          : number of neighbors with data in both HAs
      n_changed_neighbors  : number of those with a changed WT residue
      frac_changed_neighbors: fraction of neighbors with changed WT
      same_wildtype        : whether this site's own WT residue changed
      surface_exposed      : 'Buried' / 'Exposed' based on 4o5n RSA
    """
    # Determine which sites have changed WT between the two subtypes
    wt_changed = {}
    for _, row in js_df.iterrows():
        site = str(row['struct_site'])
        wx, wy = row[f'{ha_x}_wt_aa'], row[f'{ha_y}_wt_aa']
        if pd.notna(wx) and pd.notna(wy):
            wt_changed[site] = (wx != wy)

    results = []
    for _, row in js_df.iterrows():
        site = str(row['struct_site'])
        pdb_key = site_to_pdb.get(site)
        if pdb_key is None:
            continue
        neighbor_keys = contacts.get(pdb_key, set())
        neighbor_sites = [
            pdb_to_site[k]
            for k in neighbor_keys
            if k in pdb_to_site and pdb_to_site[k] in wt_changed
        ]
        n = len(neighbor_sites)
        n_changed = sum(1 for s in neighbor_sites if wt_changed[s])
        results.append({
            'struct_site': site,
            'n_neighbors': n,
            'n_changed_neighbors': n_changed,
            'frac_changed_neighbors': n_changed / n if n > 0 else np.nan,
        })

    result_df = pd.DataFrame(results)
    merged = js_df.merge(result_df, on='struct_site').assign(
        JSD=lambda x: x[f'JS_{ha_x}_vs_{ha_y}'],
        same_wildtype=lambda x: np.where(
            x[f'{ha_x}_wt_aa'] == x[f'{ha_y}_wt_aa'],
            'Amino acid conserved',
            'Amino acid changed',
        ),
        surface_exposed=lambda x: np.where(
            x[rsa_col] > buried_cutoff, 'Exposed', 'Buried'
        ),
    )
    return merged


nbhd_h3_h5 = compute_neighborhood_changes(js_df_h3_h5, 'h3', 'h5')
nbhd_h3_h7 = compute_neighborhood_changes(js_df_h3_h7, 'h3', 'h7')
nbhd_h5_h7 = compute_neighborhood_changes(js_df_h5_h7, 'h5', 'h7')

nbhd_h3_h5.head(3)

def format_pvalue(p):
    superscripts = str.maketrans("0123456789-", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻")
    if p < 1e-5:
        return "p < 1 × 10⁻⁵"
    elif p < 1e-3:
        base, exp = f"{p:.0e}".split("e")
        exp_str = str(int(exp)).translate(superscripts)
        return f"p = {base} × 10{exp_str}"
    else:
        return f"p = {p:.2g}"


def plot_jsd_vs_frac_changed(nbhd_df, ha_x, ha_y, colors, width=175, height=150):
    df = (
        nbhd_df
        .query('surface_exposed == "Buried"')
        .dropna(subset=['JSD', 'frac_changed_neighbors'])
        .reset_index(drop=True)
    )

    r = df['JSD'].corr(df['frac_changed_neighbors'], method='spearman')
    r_text = f"r = {r:.2f}"

    chart = alt.Chart(df).mark_circle(
        size=30, opacity=0.8, strokeWidth=0.5, stroke='black'
    ).encode(
        x=alt.X(
            'frac_changed_neighbors:Q',
            title=['Fraction of neighbors', 'with changed WT residue'],
            scale=alt.Scale(domain=[0, 1]),
        ),
        y=alt.Y('JSD:Q', title=['Divergence in amino-acid', 'preferences']),
        color=alt.Color(
            'same_wildtype:N',
            title=None,
            scale=alt.Scale(
                domain=list(colors.keys()),
                range=list(colors.values()),
            ),
            legend=alt.Legend(orient='top', columns=1),
        ),
        tooltip=[
            'struct_site',
            f'{ha_x}_wt_aa',
            f'{ha_y}_wt_aa',
            'JSD',
            'frac_changed_neighbors',
            'n_neighbors',
            'n_changed_neighbors',
        ],
    ).properties(
        width=width,
        height=height,
        title=alt.Title(
            f'{ha_x.upper()} vs. {ha_y.upper()}',
            fontSize=16,
            anchor='middle',
        ),
    )

    r_label = alt.Chart(pd.DataFrame({'text': [r_text]})).mark_text(
        align='left', baseline='top', fontSize=14, color='black'
    ).encode(
        text='text:N',
        x=alt.value(5),
        y=alt.value(5),
    )

    return chart + r_label


colors = {
    'Amino acid changed': '#5484AF',
    'Amino acid conserved': '#E04948',
}

(
    plot_jsd_vs_frac_changed(nbhd_h3_h5, 'h3', 'h5', colors)
    | plot_jsd_vs_frac_changed(nbhd_h3_h7, 'h3', 'h7', colors)
    | plot_jsd_vs_frac_changed(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(color='shared')

def plot_frac_changed_boxplot(nbhd_df, ha_x, ha_y, colors, box_size=30, width=175, height=150):
    df = (
        nbhd_df
        .query('surface_exposed == "Buried"')
        .dropna(subset=['frac_changed_neighbors'])
        .reset_index(drop=True)
    )

    groups = df.groupby('same_wildtype')['frac_changed_neighbors'].apply(list)
    # Mann-Whitney U: changed > conserved
    stat, p = mannwhitneyu(
        groups.get('Amino acid changed', []),
        groups.get('Amino acid conserved', []),
        alternative='greater',
    )
    p_text = format_pvalue(p)

    label_expr = (
        "datum.label == 'Amino acid changed' ? ['Amino acid', 'changed'] "
        ": datum.label == 'Amino acid conserved' ? ['Amino acid', 'conserved'] "
        ": datum.label"
    )

    boxplot = alt.Chart(df).mark_boxplot(
        size=box_size, outliers=False
    ).encode(
        x=alt.X(
            'same_wildtype:N',
            title=None,
            axis=alt.Axis(labelAngle=0, labelExpr=label_expr),
        ),
        y=alt.Y(
            'frac_changed_neighbors:Q',
            title=['Fraction of neighbors', 'with changed WT residue'],
        ),
        color=alt.Color(
            'same_wildtype:N',
            legend=None,
            scale=alt.Scale(
                domain=list(colors.keys()),
                range=list(colors.values()),
            ),
        ),
    ).properties(
        width=width,
        height=height,
        title=alt.Title(
            f'{ha_x.upper()} vs. {ha_y.upper()}',
            fontSize=16,
            anchor='middle',
        ),
    )

    p_label = alt.Chart(pd.DataFrame({'text': [p_text]})).mark_text(
        align='center', baseline='top', fontSize=14, color='black'
    ).encode(
        text='text:N',
        y=alt.value(-15),
    )

    return boxplot + p_label


colors = {
    'Amino acid changed': '#5484AF',
    'Amino acid conserved': '#E04948',
}

(
    plot_frac_changed_boxplot(nbhd_h3_h5, 'h3', 'h5', colors)
    | plot_frac_changed_boxplot(nbhd_h3_h7, 'h3', 'h7', colors)
    | plot_frac_changed_boxplot(nbhd_h5_h7, 'h5', 'h7', colors)
).resolve_scale(y='shared')

	struct_site	h3_wt_aa	h5_wt_aa	rmsd_h3h5	4o5n_aa_RSA	JS_h3_vs_h5	n_neighbors	n_changed_neighbors	frac_changed_neighbors	JSD	same_wildtype	surface_exposed
0	9	S	K	9.167400	1.084277	0.079776	4	3	0.750000	0.079776	Amino acid changed	Exposed
1	10	T	S	8.157247	0.150962	0.272808	12	7	0.583333	0.272808	Amino acid changed	Buried
2	11	A	D	5.040040	0.050388	0.347071	14	8	0.571429	0.347071	Amino acid changed	Buried

Build contact map from 4o5n (H3) PDB¶

Compute fraction of changed neighbors per site¶

Divergence versus fraction of changed neighbors at buried sites¶

Do buried sites with changed WT have more changed neighbors?¶