import pandas as pd
import numpy as np
import altair as alt

from Bio import AlignIO
from natsort import natsort_keygen

import theme

def plot_identity_heatmap(df, title, width=100, height=100, order=None, highlight_pairs=None):
    """
    Plot a lower-triangle heatmap of sequence identity.
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame with columns 'ha_x', 'ha_y', and 'percent_identity'
    title : str
        Title for the plot
    width : int
        Width of the plot
    height : int
        Height of the plot
    order : list, optional
        Order of HA subtypes for axes
    highlight_pairs : list of tuples, optional
        List of (ha_x, ha_y) pairs to highlight in red, e.g., [('H3', 'H5'), ('H3', 'H7')]
    """
    if order is None:
        order = sorted(df['ha_x'].unique(), key=natsort_keygen())

    # Filter to lower triangle only (excluding diagonal)
    # For lower triangle: index of ha_y > index of ha_x
    order_dict = {ha: i for i, ha in enumerate(order)}
    df_filtered = df[
        df.apply(lambda row: order_dict[row['ha_y']] > order_dict[row['ha_x']], axis=1)
    ].copy()
    
    # Add highlight column if highlight_pairs is specified
    if highlight_pairs is not None:
        # Normalize pairs to ensure they're in the right order (lower triangle)
        normalized_pairs = set()
        for ha_x, ha_y in highlight_pairs:
            # Make sure we're using the lower triangle ordering
            if order_dict[ha_y] > order_dict[ha_x]:
                normalized_pairs.add((ha_x, ha_y))
            else:
                normalized_pairs.add((ha_y, ha_x))
        
        df_filtered['highlighted'] = df_filtered.apply(
            lambda row: (row['ha_x'], row['ha_y']) in normalized_pairs,
            axis=1
        )
    else:
        df_filtered['highlighted'] = False

    # Split data into highlighted and non-highlighted
    df_normal = df_filtered[~df_filtered['highlighted']]
    df_highlight = df_filtered[df_filtered['highlighted']]

    # Create base heatmap for non-highlighted cells
    heatmap_normal = alt.Chart(df_normal).mark_rect(
        opacity=0.95, stroke='black', strokeWidth=1
    ).encode(
        x=alt.X('ha_x:N', title=None, axis=alt.Axis(labelAngle=-90), sort=order),
        y=alt.Y('ha_y:N', title=None, sort=order),
        color=alt.Color(
            'percent_identity:Q',
            scale=alt.Scale(scheme='blues'),
            title=['Amino Acid', 'Identity (%)']
        )
    ).properties(
        width=width,
        height=height,
        title=alt.Title(title, anchor='middle')
    )
    
    # Create heatmap for highlighted cells (drawn on top)
    heatmap_highlight = alt.Chart(df_highlight).mark_rect(
        opacity=0.95, stroke='red', strokeWidth=3
    ).encode(
        x=alt.X('ha_x:N', title=None, axis=alt.Axis(labelAngle=-90), sort=order),
        y=alt.Y('ha_y:N', title=None, sort=order),
        color=alt.Color(
            'percent_identity:Q',
            scale=alt.Scale(scheme='blues'),
            title=['Amino Acid', 'Identity (%)']
        )
    )

    # Add text labels on the heatmap
    text = alt.Chart(df_filtered).mark_text(baseline='middle').encode(
        x=alt.X('ha_x:N', sort=order),
        y=alt.Y('ha_y:N', sort=order),
        text=alt.Text('percent_identity:Q', format='.0f'),
        color=alt.condition(
            alt.datum.percent_identity > 55,
            alt.value('white'),
            alt.value('black')
        )
    )

    # Layer them: normal cells first, then highlighted cells, then text on top
    chart = (heatmap_normal + heatmap_highlight + text)
    return chart

# Calculate pairwise amino acid identity for all 18 HA subtypes

# Read the alignment
alignment = AlignIO.read('../data/ha18_alignment_revised.fasta', 'fasta')

# Extract HA subtype names from sequence IDs
# Format: A/.../H#N#
ha_subtypes = []
for record in alignment:
    # Extract H number from sequence ID (e.g., H1N1 -> H1)
    subtype = record.id.split('/')[-1].split('N')[0]
    ha_subtypes.append(subtype)

print(f"Found {len(ha_subtypes)} HA subtypes: {', '.join(ha_subtypes)}")

# Calculate pairwise identity matrix
n_seqs = len(alignment)
results = []

for i in range(n_seqs):
    for j in range(n_seqs):
        seq1 = str(alignment[i].seq)
        seq2 = str(alignment[j].seq)
        
        if i == j:
            # Self-comparison
            results.append({
                'ha_x': ha_subtypes[i],
                'ha_y': ha_subtypes[j],
                'matches': np.nan,
                'alignable_residues': np.nan,
                'percent_identity': 100.0
            })
        else:
            # Count positions where both sequences have amino acids (not gaps)
            matches = 0
            alignable = 0
            
            for aa1, aa2 in zip(seq1, seq2):
                # Skip positions where either sequence has a gap
                if aa1 != '-' and aa2 != '-':
                    alignable += 1
                    if aa1 == aa2:
                        matches += 1
            
            pct_identity = (matches / alignable * 100) if alignable > 0 else 0
            
            results.append({
                'ha_x': ha_subtypes[i],
                'ha_y': ha_subtypes[j],
                'matches': matches,
                'alignable_residues': alignable,
                'percent_identity': pct_identity
            })

ha18_identity_df = pd.DataFrame(results)

print(f"\nCalculated {len(ha18_identity_df)} pairwise comparisons")
ha18_identity_df.head()

Found 18 HA subtypes: H3, H5, H7, H1, H2, H4, H6, H8, H9, H10, H11, H12, H13, H14, H15, H16, H17, H18

Calculated 324 pairwise comparisons

seqs_of_interest = ha18_identity_df.query(
    'ha_x == "H3" & ha_y == "H5" or ha_x == "H3" & ha_y == "H7" or ha_x == "H5" & ha_y == "H7"'
)
seqs_of_interest.to_csv('../results/sequence_identity/ha_sequence_identity.csv', index=False)
seqs_of_interest

# Use hierarchical clustering to order HAs by similarity
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

def cluster_ha(df):
    # Convert percent identity to distance matrix (100 - identity)
    identity_matrix = df.pivot(index='ha_y', columns='ha_x', values='percent_identity')
    distance_matrix = 100 - identity_matrix

    # Perform hierarchical clustering
    linkage_matrix = linkage(squareform(distance_matrix), method='average')

    # Get the order of HAs from clustering
    dendro = dendrogram(linkage_matrix, no_plot=True)
    clustered_order = [identity_matrix.index[i] for i in dendro['leaves']]
    return clustered_order

order = cluster_ha(ha18_identity_df)
plot_identity_heatmap(
    ha18_identity_df, '', width=400, height=400, order=order, 
    highlight_pairs=[('H3', 'H5'), ('H3', 'H7'), ('H5', 'H7')]
)

aln_df = pd.read_csv('../results/structural_alignment/structural_alignment.csv')[[
    'struct_site', 'h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa'
]].assign(
    ha_region=lambda x: pd.Categorical(
        np.where(
            x['struct_site'].map(lambda s: natsort_keygen()(str(s))) <= natsort_keygen()('329'),
            'HA1',
            'HA2'
        ),
        categories=['HA1', 'HA2']
    )
)

aln_df.head()

# Calculate sliding window conservation across sites
def calculate_sliding_window_conservation(df, window_size=30):
    """
    Calculate sliding window conservation for pairwise HA comparisons.
    
    Parameters:
    -----------
    df : DataFrame
        Alignment dataframe with columns 'struct_site', 'h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa'
    window_size : int
        Size of the sliding window
        
    Returns:
    --------
    DataFrame with columns: site, comparison, percent_conserved
    """
    # Sort by struct_site using natural sorting
    df = df.copy()
    df = df.sort_values('struct_site', key=natsort_keygen()).reset_index(drop=True)
    
    # Add a positional index for x-axis plotting
    df['site_index'] = range(len(df))
    
    # Define pairwise comparisons
    comparisons = [
        ('H3', 'H5', 'h3_wt_aa', 'h5_wt_aa'),
        ('H3', 'H7', 'h3_wt_aa', 'h7_wt_aa'),
        ('H5', 'H7', 'h5_wt_aa', 'h7_wt_aa')
    ]
    
    results = []
    
    for ha1, ha2, col1, col2 in comparisons:
        comparison_name = f'{ha1}-{ha2}'
        
        # Calculate conservation for each window
        for i in range(len(df)):
            # Define window boundaries
            start_idx = max(0, i - window_size // 2)
            end_idx = min(len(df), i + window_size // 2 + 1)
            
            # Get window data
            window_df = df.iloc[start_idx:end_idx]
            
            # Get alignable positions (both sequences have amino acids)
            alignable = window_df[[col1, col2]].dropna()
            
            if len(alignable) > 0:
                # Calculate percent conserved (matching amino acids)
                matches = (alignable[col1] == alignable[col2]).sum()
                pct_conserved = (matches / len(alignable)) * 100
            else:
                pct_conserved = np.nan
            
            results.append({
                'site': df.loc[i, 'struct_site'],
                'site_index': df.loc[i, 'site_index'],
                'comparison': comparison_name,
                'percent_conserved': pct_conserved,
                'window_size': end_idx - start_idx,
                'ha_region': df.loc[i, 'ha_region']
            })
    
    return pd.DataFrame(results)

window_df = calculate_sliding_window_conservation(aln_df, window_size=30)
window_df.head()

# Create sliding window conservation plot

# Define colors for each comparison
comparison_colors = {
    'H3-H5': '#0099B4',
    'H3-H7': '#ED0000',
    'H5-H7': '#FFDC91'
}

# Get the unique sites in natural sort order for the x-axis
site_order = window_df.drop_duplicates('site').sort_values('site_index')['site'].tolist()

# Find the HA1/HA2 boundary position
boundary_idx = window_df[window_df['ha_region'] == 'HA2']['site_index'].min()

# Create the line chart
chart = alt.Chart(window_df).mark_line(
    size=2,
    opacity=1
).encode(
    x=alt.X(
        'site:N',
        title='Site',
        sort=site_order,
        axis=alt.Axis(
            labelAngle=0,
            values=['100', '200', '300', '400', '500'],
            tickCount=5
        )
    ),
    y=alt.Y(
        'percent_conserved:Q',
        title=['Amino Acid Identity (%)'],
        scale=alt.Scale(domain=[0, 100])
    ),
    color=alt.Color(
        'comparison:N',
        title='Comparison',
        scale=alt.Scale(
            domain=list(comparison_colors.keys()),
            range=list(comparison_colors.values())
        ),
        legend=alt.Legend(
            orient='top',
            direction='horizontal',
        )
    ),
    tooltip=['window_size']
).properties(
    width=400,
    height=150,
)

# Add a vertical line to separate HA1 and HA2 using site_index
if boundary_idx is not None and not pd.isna(boundary_idx):
    boundary_site = site_order[int(boundary_idx)]
    separator = alt.Chart(pd.DataFrame({'site': [boundary_site]})).mark_rule(
        strokeDash=[5, 5],
        color='gray',
        size=1
    ).encode(
        x=alt.X('site:N', sort=site_order)
    )

    # Combine all layers
    final_chart = (chart + separator)
else:
    final_chart = chart

final_chart

# Calculate pairwise amino acid identity between HA domains

columns = ['h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa']
results = []

for region in ['HA1', 'HA2']:
    # Filter to current region
    region_df = aln_df[aln_df['ha_region'] == region]
    
    # Calculate identity for all pairwise combinations in both directions
    for col1 in columns:
        for col2 in columns:
            if col1 == col2:
                # Self-comparison is 100% identity
                ha1 = col1.replace('_wt_aa', '').upper()
                ha2 = col2.replace('_wt_aa', '').upper()
                results.append({
                    'ha_region': region,
                    'ha_x': ha1,
                    'ha_y': ha2,
                    'matches': np.nan,
                    'alignable_residues': np.nan,
                    'percent_identity': 100.0
                })
            else:
                # Get positions where both sequences have amino acids (alignable residues)
                alignable = region_df[[col1, col2]].dropna()
                
                # Count matches
                matches = (alignable[col1] == alignable[col2]).sum()
                total = len(alignable)
                
                # Calculate percent identity
                pct_identity = (matches / total * 100) if total > 0 else 0
                
                ha1 = col1.replace('_wt_aa', '').upper()
                ha2 = col2.replace('_wt_aa', '').upper()
                
                results.append({
                    'ha_region': region,
                    'ha_x': ha1,
                    'ha_y': ha2,
                    'matches': matches,
                    'alignable_residues': total,
                    'percent_identity': pct_identity
                })

domain_identity_df = pd.DataFrame(results)
domain_identity_df

	ha_x	ha_y	matches	alignable_residues	percent_identity
0	H3	H3	NaN	NaN	100.000000
1	H3	H5	192.0	479.0	40.083507
2	H3	H7	229.0	483.0	47.412008
3	H3	H1	207.0	487.0	42.505133
4	H3	H2	202.0	487.0	41.478439

	site	site_index	comparison	percent_conserved	window_size	ha_region
0	9	0	H3-H5	25.000000	16	HA1
1	10	1	H3-H5	23.529412	17	HA1
2	11	2	H3-H5	27.777778	18	HA1
3	12	3	H3-H5	26.315789	19	HA1
4	13	4	H3-H5	30.000000	20	HA1

	ha_region	ha_x	ha_y	matches	alignable_residues	percent_identity
0	HA1	H3	H3	NaN	NaN	100.000000
1	HA1	H3	H5	108.0	315.0	34.285714
2	HA1	H3	H7	115.0	313.0	36.741214
3	HA1	H5	H3	108.0	315.0	34.285714
4	HA1	H5	H5	NaN	NaN	100.000000
5	HA1	H5	H7	115.0	312.0	36.858974
6	HA1	H7	H3	115.0	313.0	36.741214
7	HA1	H7	H5	115.0	312.0	36.858974
8	HA1	H7	H7	NaN	NaN	100.000000
9	HA2	H3	H3	NaN	NaN	100.000000
10	HA2	H3	H5	84.0	164.0	51.219512
11	HA2	H3	H7	114.0	170.0	67.058824
12	HA2	H5	H3	84.0	164.0	51.219512
13	HA2	H5	H5	NaN	NaN	100.000000
14	HA2	H5	H7	87.0	161.0	54.037267
15	HA2	H7	H3	114.0	170.0	67.058824
16	HA2	H7	H5	87.0	161.0	54.037267
17	HA2	H7	H7	NaN	NaN	100.000000