In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Apply seaborn's "whitegrid" look to every figure drawn in this notebook
PLOT_STYLE = "whitegrid"
sns.set(style=PLOT_STYLE)

In [None]:
# Fetch the LLM-coded sample produced by each model (one CSV per model)
url_70b = 'https://zjelveh.github.io/files/llm_coded_sample_deepseek-r1-distill-llama-70b.csv'
url_8b = 'https://zjelveh.github.io/files/llm_coded_sample_llama-3.1-8b-instant.csv'
url_1b = 'https://zjelveh.github.io/files/llm_coded_sample_llama-3.2-1b-preview.csv'
df_70b, df_8b, df_1b = (pd.read_csv(url) for url in (url_70b, url_8b, url_1b))


In [None]:
# Tag every row with the name of the model that produced it
for frame, label in (
    (df_70b, 'DeepSeek-70B'),
    (df_8b, 'Llama-3.1-8B'),
    (df_1b, 'Llama-3.2-1B'),
):
    frame['model'] = label

In [None]:
# Shape before filtering
df_8b.shape

# Keep only the rows whose code was assigned more than 20 times.
# (Despite the "1b" in its name, `keep_1b_codes` is computed from df_8b.)
code_frequency = df_8b['code'].value_counts()
keep_1b_codes = code_frequency.loc[code_frequency > 20].index
df_8b = df_8b[df_8b['code'].isin(keep_1b_codes)]

# Shape after filtering
df_8b.shape

In [None]:
# Stack the two models' rows into one long frame for side-by-side comparison.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs' row labels are carried over and the combined frame contains
# duplicate labels, which makes label-based access/alignment ambiguous.
all_data = pd.concat([df_70b, df_8b], ignore_index=True)

# 1. Basic Stats - Code Count by Model
# One row per (model, code) pair with the number of posts given that code
code_counts = all_data.groupby(['model', 'code']).size().reset_index(name='count')

plt.figure(figsize=(14, 8))
sns.barplot(
    data=code_counts, 
    x="code", y="count", hue="model",
    palette="deep", alpha=.8
)
# Code labels are long; tilt them so they do not overlap
plt.xticks(rotation=45, ha="right")
plt.title('Code Assignment Frequency by Model')
plt.tight_layout()
# plt.savefig('code_distribution.png')

In [None]:
# 2. Compare Codes

# Pair up the rows of the two models that share a uid
both = df_8b.merge(df_70b, on=['uid'], suffixes=('_llama', '_deepseek'))


def _shade_crosstab(table):
    """Return a Styler that colours `table` on one table-wide YlGnBu scale."""
    return table.style.background_gradient(
        cmap='YlGnBu',                # Yellow-Green-Blue palette
        axis=None,                    # one colour scale for the whole table
        vmin=0,                       # bottom of the colour scale
        vmax=table.max().max(),       # top of the colour scale = largest cell
        text_color_threshold=0.5,     # switch to light text on dark cells
    ).format("{:.2f}")                # show two decimal places


# Rows: Llama code, columns: DeepSeek code, normalised within each row
crosstab = pd.crosstab(both.code_llama, both.code_deepseek, normalize='index')
styled_crosstab = _shade_crosstab(crosstab)
styled_crosstab

# Same comparison with the roles swapped: rows are DeepSeek codes
crosstab = pd.crosstab(both.code_deepseek, both.code_llama, normalize='index')
styled_crosstab = _shade_crosstab(crosstab)
styled_crosstab

In [None]:
# 3. Confidence Analysis by Code
# Fix the left-to-right order of the codes so it does not depend on row order
code_order = sorted(all_data['code'].unique())

plt.figure(figsize=(10, 6))
sns.boxplot(
    x="code",
    y="confidence",
    hue="model",
    data=all_data,
    order=code_order,
    palette="Set2",
)
plt.xticks(rotation=45, ha="right")
plt.title('Confidence Distribution by Code and Model')
plt.tight_layout();
# plt.savefig('confidence_boxplot.png')

In [None]:
# 4. Reasoning Content Similarity - Embedding Analysis with all data
# NOTE: the variables suffixed "_1b" in this cell are built from df_8b (the
# Llama-3.1-8B frame); the names are kept so other cells referring to them
# keep working.

# Create embeddings of each model's free-text reasoning
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_70b = model.encode(df_70b['reasoning'].tolist())
embeddings_1b = model.encode(df_8b['reasoning'].tolist())

df_emb_70b = pd.DataFrame(embeddings_70b)
df_emb_1b = pd.DataFrame(embeddings_1b)

# Attach uids BY POSITION. Assigning the Series itself would align on index
# labels, and df_8b keeps its original (gappy) labels after being row-filtered,
# while the embedding frames have a fresh 0..n-1 index -> uids would be paired
# with the wrong embeddings or become NaN. `.to_numpy()` drops the index so
# row i of the embeddings gets the uid of the i-th encoded text.
df_emb_70b['uid'] = df_70b['uid'].to_numpy()
df_emb_1b['uid'] = df_8b['uid'].to_numpy()

# Collapse to one (mean) embedding per uid; after reset_index 'uid' is the
# first column and the embedding dimensions follow it
df_emb_70b = df_emb_70b.groupby(['uid']).mean().reset_index()
df_emb_1b = df_emb_1b.groupby(['uid']).mean().reset_index()

# Keep only the uids that are present for both models
df_emb_70b = df_emb_70b[df_emb_70b.uid.isin(df_emb_1b.uid)]
df_emb_1b = df_emb_1b[df_emb_1b.uid.isin(df_emb_70b.uid)]

# The arithmetic below pairs row i of one frame with row i of the other,
# so make sure both frames list the same uids in the same order
assert np.array_equal(df_emb_70b['uid'].to_numpy(), df_emb_1b['uid'].to_numpy()), \
    "uid rows are not aligned between the two embedding frames"

# Compute cosine similarity row by row: dot product / (norm_a * norm_b)
vectors_70b = df_emb_70b.iloc[:, 1:].values   # skip the leading 'uid' column
vectors_1b = df_emb_1b.iloc[:, 1:].values
numerator = (vectors_70b * vectors_1b).sum(1)
denominator = ((vectors_70b ** 2).sum(1) * (vectors_1b ** 2).sum(1)) ** .5

cos_sim = pd.DataFrame({'uid' : df_emb_70b.uid,
                        'cos_sim' : numerator / denominator})

# Bring the per-uid similarity back onto each model's rows
df_70b_cs = df_70b.merge(cos_sim, on=['uid'])
df_8b_cs = df_8b.merge(cos_sim, on=['uid'])

# Mean cross-model similarity, grouped by the code each model assigned
df_70b_cs.groupby(['code']).cos_sim.mean().reset_index()
df_8b_cs.groupby(['code']).cos_sim.mean().reset_index()

In [None]:
# 5 Coherence
from sklearn.metrics.pairwise import cosine_similarity

# For each code assigned by a model, measure the semantic similarity 
# between all posts assigned that code

def measure_code_coherence(df, model_name, encoder=None):
    """Measure how semantically similar posts within each code are.

    For every distinct value of ``df['code']`` that has at least two rows, the
    ``reasoning`` texts of those rows are embedded and the mean cosine
    similarity over all unordered pairs of texts is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``code`` and ``reasoning``.
    model_name : str
        Label copied into the ``model`` column of every result row.
    encoder : object, optional
        Any object with an ``encode(list_of_texts)`` method returning one
        vector per text. Defaults to the notebook-level ``model``.

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``code``, ``coherence``, ``n_posts``; one row per
        code with at least two posts. When no code qualifies, the frame is
        empty but still has these columns.
    """
    if encoder is None:
        # Fall back to the sentence-embedding model created earlier in the notebook
        encoder = model

    columns = ['model', 'code', 'coherence', 'n_posts']
    results = []

    for code in df['code'].unique():
        # Reasoning texts of all posts that received this code
        code_posts = df.loc[df['code'] == code, 'reasoning'].tolist()
        if len(code_posts) < 2:
            # A single post has no pair to compare against
            continue

        embeddings = encoder.encode(code_posts)

        # One call yields the full n x n similarity matrix; the entries
        # strictly above the diagonal are exactly the pairs (i, j) with i < j,
        # i.e. every unordered pair once and no self-similarities.
        sim_matrix = cosine_similarity(embeddings)
        pair_index = np.triu_indices(len(code_posts), k=1)
        avg_similarity = np.mean(sim_matrix[pair_index])

        results.append({
            'model': model_name,
            'code': code,
            'coherence': avg_similarity,
            'n_posts': len(code_posts)
        })

    # Passing `columns` keeps the schema stable even when `results` is empty
    return pd.DataFrame(results, columns=columns)

# Compare thematic coherence within each model's codes.
# The labels are the same strings stored in each frame's `model` column, so
# coherence results can be matched to the other per-model tables by label.
deepseek_coherence = measure_code_coherence(df_70b, 'DeepSeek-70B')
llama_coherence = measure_code_coherence(df_8b, 'Llama-3.1-8B')

In [None]:
# List each model's codes from least to most coherent
deepseek_coherence.sort_values(by='coherence')
llama_coherence.sort_values(by='coherence')