# Lab 3: Contextualized Embeddings
In natural language processing (NLP), words rarely exist in isolation. The meaning of a word often depends heavily on its surrounding context. Traditional word embeddings like Word2Vec and GloVe assign a single vector representation to each word, regardless of context. While useful, this approach has limitations when dealing with polysemous words (words with multiple meanings).

This lab explores **contextualized embeddings**, which capture how the meaning of a word shifts based on its context. Using transformer-based models like DistilBERT, we'll examine how the vector representations of words change when they appear in different contexts.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import time
import warnings
import os

# Add imports for t-SNE and UMAP
from sklearn.manifold import TSNE
import umap


from sklearn.metrics.pairwise import cosine_similarity
    
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Environment variable read by the Hugging Face tokenizers library
# (presumably set to avoid its parallelism/fork warning -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Select the CPU device explicitly (this does not probe the hardware).
# NOTE(review): `device` is only printed here; none of the visible cells pass
# it to a model or tensor.
device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Install required packages if needed
# !pip install transformers sentence-transformers numpy matplotlib scikit-learn pandas torch seaborn

## Part 1: Case Study of Polysemous Words
Our case study focuses on two words with multiple meanings: "space" and "ship". We'll examine sentences where these words are used in different contexts:
- Outer space vs. physical space
- Spaceships vs. seafaring ships

We will use DistilBERT, a smaller version of BERT, in this section.

In [None]:
# One checkpoint name drives both the tokenizer and the encoder so the
# vocabulary always matches the model weights.
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The sentences below are not quite grammatically correct, but they are still coherent. They use "space" (outer space vs. abstract physical space) and "ship" (spaceship vs. seafaring ship) in different contexts.

In [None]:
# Sentences that place each target word in contrasting senses.
contexts = [
    "The space ship 's rockets launched into the atmosphere.",
    "The space in the ship 's hull was flooded with water after hitting the reef.",
    "The cruise ship had plenty of space for guests on the 10th floor.",
    "Blackholes in space are dangerous for ship s that get close to the event horizon.",
]

# Words whose embeddings are compared across the sentences above.
target_words = ["space", "ship"]

# Baseline: embed each target word on its own, with no surrounding sentence
# and no special tokens, averaging over whatever tokens the word produces.
isolated_embeddings = {}
for target in target_words:
    encoded_word = tokenizer(target, return_tensors="pt", add_special_tokens=False)

    with torch.no_grad():
        hidden_states = model(**encoded_word).last_hidden_state
        isolated_embeddings[target] = hidden_states.mean(dim=1).numpy()

### Extract embeddings for these words in different contexts

In [None]:
# Maps "<target> in '<sentence>'" to the embedding of that target word as it
# appears inside the sentence.
contextualized_embeddings = {}

for sentence in contexts:
    # Encode the full sentence and recover the token strings for matching.
    encoded_sentence = tokenizer(sentence, return_tensors="pt")
    sentence_tokens = tokenizer.convert_ids_to_tokens(encoded_sentence["input_ids"][0])

    # One vector per token; index 0 drops the batch dimension.
    with torch.no_grad():
        token_vectors = model(**encoded_sentence).last_hidden_state[0].numpy()

    for target in target_words:
        # The tokenizer may split a word into several pieces; compare with
        # the "##" continuation marker stripped on both sides.
        target_pieces = [piece.replace("##", "") for piece in tokenizer.tokenize(target)]
        width = len(target_pieces)

        # Slide a window of `width` tokens across the sentence.
        for start in range(max(0, len(sentence_tokens) - width + 1)):
            window = [tok.replace("##", "") for tok in sentence_tokens[start:start + width]]
            if window != target_pieces:
                continue

            # Multi-piece words are represented by the mean of their pieces.
            # A later occurrence in the same sentence overwrites an earlier one.
            contextualized_embeddings[f"{target} in '{sentence}'"] = (
                token_vectors[start:start + width].mean(axis=0)
            )

### Compute similarities between all pairs of embeddings

In [None]:
# Isolated and in-context embeddings share one namespace for comparison.
all_embeddings = {**isolated_embeddings, **contextualized_embeddings}

# cosine_similarity expects 2-D inputs, so promote every vector to a single
# row once up front instead of inside the pair loop.
embeddings_as_rows = {
    name: np.array(vector, ndmin=2) for name, vector in all_embeddings.items()
}

# Ordered pairs (a, b) with a != b; both (a, b) and (b, a) are stored.
similarity_matrix = {
    (first, second): cosine_similarity(first_row, second_row)[0][0]
    for first, first_row in embeddings_as_rows.items()
    for second, second_row in embeddings_as_rows.items()
    if first != second
}
            

### Plot
Plot embeddings using three different dimensionality reduction techniques
- PCA
- t-SNE
- UMAP

In [None]:
# Labels and vectors are read from the same dict, so row i of
# `embeddings_array` belongs to `selected_labels[i]`.
selected_embeddings = all_embeddings
selected_labels = [name for name in selected_embeddings]

# Stack every embedding into one (n_points, dim) matrix for the 2-D projections.
embeddings_array = np.vstack([vector for vector in selected_embeddings.values()])



# Function to clean labels for visualization
def clean_label(label):
    """Shorten an embedding label to "<target word> (<sense>)" for plotting.

    Labels built for contextualized embeddings look like
    "<target> in '<sentence>'". When the sentence contains one of the
    keywords below, the label is reduced to the target word followed by the
    sense the sentence illustrates. Labels without a keyword (for example the
    isolated words "space" and "ship") are returned unchanged.

    The target word is taken from the text before " in '" rather than from a
    fixed-width slice, so targets of any length are kept intact ("ship" no
    longer becomes "ship i").

    Args:
        label: Full label string.

    Returns:
        The shortened label, or `label` itself when no keyword matches.
    """
    # (keyword searched case-insensitively, sense shown on the plot);
    # the first matching keyword wins.
    sense_by_keyword = (
        ('blackhole', 'outer space'),
        ('atmospher', 'outer space'),
        ('cruise ship', 'inner space'),
        ('hull', 'inner space'),
    )

    lowered = label.lower()
    for keyword, sense in sense_by_keyword:
        if keyword in lowered:
            target = label.split(" in '", 1)[0].strip()
            return f"{target} ({sense})"
    return label

# Function to plot embeddings with consistent formatting
def plot_embeddings(embeddings_2d, labels, title, method_name):
    """Draw one annotated scatter plot of 2-D embeddings.

    Args:
        embeddings_2d: Array indexed as [point, axis] with at least two columns.
        labels: One label per point; each is passed through `clean_label`.
        title: Main plot title.
        method_name: Text appended to the title in parentheses.

    Each point gets a colour and marker chosen by cycling through two fixed
    lists, and its cleaned label is written next to it. The figure is shown
    immediately; nothing is returned.
    """
    palette = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray']
    shapes = ['o', 's', '^', 'D', 'x']

    plt.figure(figsize=(12, 10))

    for idx, raw_label in enumerate(labels):
        x_coord = embeddings_2d[idx, 0]
        y_coord = embeddings_2d[idx, 1]
        text = clean_label(raw_label)

        plt.scatter(
            x_coord,
            y_coord,
            color=palette[idx % len(palette)],
            s=100,
            marker=shapes[idx % len(shapes)],
            label=text,
        )

        # Write the label slightly up and to the right of the marker.
        plt.annotate(
            text,
            (x_coord, y_coord),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=10,
        )

    plt.title(f'{title} ({method_name})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Title shared by the three single-method figures below.
plot_title = 'Contextualized Embeddings of "space" and "ship" in Different Contexts'

# 1. PCA Visualization
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(embeddings_array)
explained_variance = pca.explained_variance_ratio_
print(f"PCA explained variance: {explained_variance.sum()*100:.2f}%")
plot_embeddings(pca_embeddings, selected_labels,
                plot_title,
                f'PCA - Explained Variance: {explained_variance.sum()*100:.2f}%')

# 2. t-SNE Visualization
# t-SNE is more computationally intensive but often better at preserving local structure.
# The iteration count is left at the library default (1000) instead of being
# passed explicitly: the keyword was renamed from `n_iter` to `max_iter` in
# newer scikit-learn releases, so naming either one ties the cell to a
# specific version. perplexity must stay below the number of points.
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
tsne_embeddings = tsne.fit_transform(embeddings_array)
plot_embeddings(tsne_embeddings, selected_labels,
                plot_title,
                't-SNE')

# 3. UMAP Visualization
# UMAP often preserves more global structure than t-SNE while still highlighting local relationships
umap_reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=2, random_state=42)
umap_embeddings = umap_reducer.fit_transform(embeddings_array)
plot_embeddings(umap_embeddings, selected_labels,
                plot_title,
                'UMAP')

# 4. Bonus: Combined visualization with subplots
# This figure is drawn at module level, so it needs its own colour and marker
# cycles: the lists inside plot_embeddings are local to that function and are
# not visible here (using them raised NameError).
colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray']
markers = ['o', 's', '^', 'D', 'x']

# (2-D coordinates, panel title) for each reduction method, left to right.
panels = [
    (pca_embeddings, f'PCA ({explained_variance.sum()*100:.2f}%)'),
    (tsne_embeddings, 't-SNE'),
    (umap_embeddings, 'UMAP'),
]

plt.figure(figsize=(18, 6))

for position, (points, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    for i, label in enumerate(selected_labels):
        plt.scatter(points[i, 0], points[i, 1],
                    color=colors[i % len(colors)], s=100,
                    marker=markers[i % len(markers)],
                    label=clean_label(label))
    plt.title(panel_title)
    plt.grid(True, linestyle='--', alpha=0.7)

# Add a shared legend. Every panel uses the same labels and styles, so the
# handles of the last (current) axes describe all three.
handles, legend_labels = plt.gca().get_legend_handles_labels()
plt.figlegend(handles, legend_labels, loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05))

plt.suptitle('Comparison of Dimensionality Reduction Techniques', fontsize=16)
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()


### Exercise 1: Exploring Word Meanings with "Bank"

In this exercise, you'll explore how contextualized embeddings capture different meanings of the word "bank".

**Tasks:**
1. Use the following sentences containing the word "bank":
   - "I need to go to the bank to deposit my paycheck."
   - "The river bank was muddy after the heavy rain."
   - "The pilot had to bank the airplane to make the turn."
   - "You can bank on me to help you move this weekend."

2. Follow the same steps we used in the "space/ship" example:
   - Generate contextualized embeddings for "bank" in each sentence
   - Calculate the cosine similarity between the different embeddings
   - Create a visualization using PCA
   
3. Answer these questions:
   - Which pairs of meanings are most similar/different?
   - Does the model clearly separate the financial meaning from the other meanings?
   

### Exercise 2: Simple Sentiment Analysis
This exercise examines how context affects words with positive or negative connotations.

**Tasks:**
1. Use the word "bright" in these contexts:
   - "The future looks bright for our company."
   - "She is the brightest student in the class."
   - "The room is too bright; can you dim the lights?"
   - "His bright yellow shirt stood out in the crowd."

2. Generate contextualized embeddings for "bright" in each sentence

3. Now use the word "dark" in these contexts:
   - "The future looks dark for our company."
   - "She has a dark sense of humor."
   - "The room is too dark; can you turn on the lights?"
   - "His dark blue shirt looked professional."

4. Generate contextualized embeddings for "dark" in each sentence

5. Visualize all embeddings in a single plot

6. Answer these questions:
   - Do the embeddings cluster by sentiment (positive/negative) or by meaning (light/intelligence)?
   - How does context change the sentiment of these words?

## Part 2: Comparing Sentence Embeddings
So far, we've explored how contextual word embeddings can capture different meanings of individual words based on their surrounding context. Now we'll take a step further and examine **sentence-level embeddings** - vector representations of entire sentences.

### Word-Level vs. Sentence-Level Embeddings

There are two main approaches to generating sentence embeddings:

1. **Word-level encoders with pooling**: Using models like DistilBERT to get contextualized embeddings for each word in a sentence, then pooling these embeddings (typically by averaging) to create a single vector for the entire sentence.

2. **Dedicated sentence encoders**: Using models specifically designed to encode entire sentences directly into fixed-length vectors, such as Sentence-BERT or the SentenceTransformer models.

While both approaches generate vector representations of sentences, they differ in how they're trained and optimized. Word-level encoders with pooling leverage the contextual understanding of transformer models, while dedicated sentence encoders are specifically fine-tuned on tasks requiring sentence similarity comparisons.

### Why Compare These Approaches?

Comparing these approaches helps us understand:

- Which embedding method better captures semantic meaning at the sentence level
- How dimensionality affects the quality of embeddings (word-level embeddings tend to have higher dimensionality)
- Which approach is more effective for downstream tasks like document classification

### Experimental Setup

In this section, we'll:

1. Use a subset of the 20 Newsgroups dataset containing articles from four categories: alt.atheism, comp.graphics, rec.sport.baseball, and sci.med
2. Generate sentence embeddings using both approaches
3. Compare their effectiveness
4. Evaluate their performance on a topic classification task
5. Visualize the embedding spaces to understand their structural differences


### Load a news dataset for topic classification


In [None]:
def load_ag_news():
    """Load a four-category subset of the 20 Newsgroups dataset.

    Despite its name, this function fetches 20 Newsgroups (not AG News) via
    scikit-learn, with headers, footers and quotes removed.

    The train split is randomly reduced to at most 1000 documents and the
    test split to at most 500, using NumPy's global random state. Every
    document is truncated to its first 1000 characters and stripped of
    surrounding whitespace.

    Returns:
        Tuple `(texts_train, labels_train, texts_test, labels_test,
        label_names)`. Labels are integer indices into `label_names`.
    """
    from sklearn.datasets import fetch_20newsgroups

    categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
    removed_parts = ('headers', 'footers', 'quotes')
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                          remove=removed_parts)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                         remove=removed_parts)

    texts_train = newsgroups_train.data
    labels_train = newsgroups_train.target

    texts_test = newsgroups_test.data
    labels_test = newsgroups_test.target

    # Limit to a smaller subset for faster processing
    max_samples = 1000  # Adjust based on computational resources
    # The threshold and the draw size must be the same number: drawing without
    # replacement fails when the draw size exceeds the population.
    max_test_samples = max_samples // 2

    if len(texts_train) > max_samples:
        indices = np.random.choice(len(texts_train), max_samples, replace=False)
        texts_train = [texts_train[i] for i in indices]
        labels_train = [labels_train[i] for i in indices]

    if len(texts_test) > max_test_samples:
        indices = np.random.choice(len(texts_test), max_test_samples, replace=False)
        texts_test = [texts_test[i] for i in indices]
        labels_test = [labels_test[i] for i in indices]

    # Truncate long texts, then trim surrounding whitespace.
    texts_train = [text[:1000].strip() for text in texts_train]
    texts_test = [text[:1000].strip() for text in texts_test]

    # Category names, indexed by the numerical labels.
    label_names = newsgroups_train.target_names

    return (texts_train, labels_train, texts_test, labels_test, label_names)

# Load the dataset
texts_train, labels_train, texts_test, labels_test, label_names = load_ag_news()

In [None]:
# Report the split sizes and the category names returned by the loader.
print(f"Training set: {len(texts_train)} samples")
print(f"Test set: {len(texts_test)} samples")
print(f"Topics: {label_names}")

### Word-level encoder with mean pooling

In [None]:
def get_word_encoder_embeddings(texts, model_name="distilbert-base-uncased"):
    """Embed each text by mean-pooling a token-level encoder's outputs.

    Args:
        texts: List of strings to embed.
        model_name: Checkpoint name passed to `AutoTokenizer` and `AutoModel`.

    Returns:
        NumPy array with one row per input text.

    The tokenizer and model are loaded on every call. Texts are processed in
    batches of 32 and truncated to 512 tokens. Progress and timing are printed.
    """
    print(f"Loading word-level encoder: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    # Evaluation mode for inference.
    encoder.eval()

    def mean_pooling(model_output, attention_mask):
        """Average token embeddings, weighting each position by the attention mask."""
        hidden = model_output[0]  # First element contains token embeddings
        mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        return torch.sum(hidden * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    # Process in batches to avoid memory issues
    batch_size = 32
    pooled_batches = []

    started = time.time()

    for offset in range(0, len(texts), batch_size):
        encoded = tokenizer(texts[offset:offset + batch_size], padding=True,
                            truncation=True, max_length=512, return_tensors='pt')

        with torch.no_grad():
            output = encoder(**encoded)

        pooled_batches.append(mean_pooling(output, encoded['attention_mask']))

    embeddings = torch.cat(pooled_batches).numpy()

    elapsed = time.time() - started
    print(f"Generated {len(embeddings)} embeddings with dimension {embeddings.shape[1]}")
    print(f"Time taken: {elapsed:.2f} seconds")

    return embeddings

### Now for the sentence encoder approach

In [None]:
def get_sentence_encoder_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """Embed each text with a SentenceTransformer model.

    Args:
        texts: List of strings to embed.
        model_name: Model name passed to `SentenceTransformer`.

    Returns:
        Whatever `model.encode` returns for `texts`; the code reads its
        length and `.shape[1]`, so it is treated as a 2-D array.

    The model is loaded on every call; progress and timing are printed.
    """
    print(f"\nLoading sentence encoder: {model_name}")
    encoder = SentenceTransformer(model_name)

    # Only the encode call is timed; batching is left to the library.
    started = time.time()
    embeddings = encoder.encode(texts, show_progress_bar=True)
    elapsed = time.time() - started

    n_rows = len(embeddings)
    n_dims = embeddings.shape[1]
    print(f"Generated {n_rows} embeddings with dimension {n_dims}")
    print(f"Time taken: {elapsed:.2f} seconds")

    return embeddings

### Generate embeddings for train and test sets using both approaches

In [None]:
# Embed both splits with both encoders. Each call reloads its model, so this
# cell loads the word-level and sentence encoders twice each.
print("Generating word-level encoder embeddings for training set...")
word_embeddings_train = get_word_encoder_embeddings(texts_train)

print("Generating word-level encoder embeddings for test set...")
word_embeddings_test = get_word_encoder_embeddings(texts_test)

print("Generating sentence encoder embeddings for training set...")
sentence_embeddings_train = get_sentence_encoder_embeddings(texts_train)

print("Generating sentence encoder embeddings for test set...")
sentence_embeddings_test = get_sentence_encoder_embeddings(texts_test)

In [None]:
# Compare embedding dimensions (number of columns in each training matrix)
print("\nEmbedding Comparison:")
print(f"Word-level embeddings dimension: {word_embeddings_train.shape[1]}")
print(f"Sentence-level embeddings dimension: {sentence_embeddings_train.shape[1]}")


### Evaluating embedding quality via topic classification

In [None]:
# Function to evaluate embeddings using a classifier
def evaluate_embeddings(train_embeddings, train_labels, test_embeddings, test_labels, 
                        embedding_type, label_names):
    """Score a set of embeddings on topic classification.

    Fits a logistic regression on the training embeddings, predicts the test
    embeddings, and prints accuracy, timings and a classification report.

    Args:
        train_embeddings: Feature matrix used to fit the classifier.
        train_labels: Targets for `train_embeddings`.
        test_embeddings: Feature matrix to predict.
        test_labels: Ground-truth targets for `test_embeddings`.
        embedding_type: Name shown in the printed header.
        label_names: Class names passed as `target_names` to the report.

    Returns:
        Dict with keys "accuracy", "report", "confusion_matrix",
        "train_time" and "predict_time" (times in seconds).
    """
    print(f"\n--- Evaluating {embedding_type} embeddings ---")

    clf = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)

    # Time fitting and prediction separately.
    fit_started = time.time()
    clf.fit(train_embeddings, train_labels)
    train_time = time.time() - fit_started

    predict_started = time.time()
    predicted = clf.predict(test_embeddings)
    predict_time = time.time() - predict_started

    accuracy = accuracy_score(test_labels, predicted)
    report = classification_report(test_labels, predicted, target_names=label_names)
    cm = confusion_matrix(test_labels, predicted)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Prediction time: {predict_time:.2f} seconds")
    print("\nClassification Report:")
    print(report)

    return {
        "accuracy": accuracy,
        "report": report,
        "confusion_matrix": cm,
        "train_time": train_time,
        "predict_time": predict_time,
    }

In [None]:
# Evaluate both embedding types
# Classifier trained and scored on the mean-pooled word-level embeddings.
word_results = evaluate_embeddings(
    word_embeddings_train, labels_train,
    word_embeddings_test, labels_test,
    "Word-level", label_names
)

In [None]:
# Same evaluation on the sentence-encoder embeddings, for comparison.
sentence_results = evaluate_embeddings(
    sentence_embeddings_train, labels_train,
    sentence_embeddings_test, labels_test,
    "Sentence-level", label_names
)

In [None]:
# Visualize the results
def plot_confusion_matrices(word_cm, sentence_cm, label_names):
    """Show the two confusion matrices side by side as annotated heatmaps.

    Args:
        word_cm: Confusion matrix for the word-level encoder (left panel).
        sentence_cm: Confusion matrix for the sentence encoder (right panel).
        label_names: Tick labels used on both axes of both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 7))

    # Left to right: (matrix, panel title).
    panels = (
        (word_cm, 'Word-level Encoder Confusion Matrix'),
        (sentence_cm, 'Sentence Encoder Confusion Matrix'),
    )

    for axis, (matrix, panel_title) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=label_names, yticklabels=label_names, ax=axis)
        axis.set_title(panel_title)
        axis.set_ylabel('True Label')
        axis.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

# Visualize confusion matrices
plot_confusion_matrices(
    word_results["confusion_matrix"],
    sentence_results["confusion_matrix"],
    label_names
)

In [None]:
# Visualize embedding spaces with t-SNE
from sklearn.manifold import TSNE

def visualize_embeddings(embeddings, labels, label_names, title):
    """Project embeddings to 2-D with t-SNE and plot them coloured by label.

    Args:
        embeddings: Array with one embedding per row.
        labels: One integer label per row, used as an index into `label_names`.
        label_names: Class names shown in the legend.
        title: Name of the embedding type, used in the progress message and title.

    Returns:
        The 2-D t-SNE coordinates. When more than 500 rows are given, a random
        subset of 500 is projected, so the returned rows do NOT line up with
        the input rows in that case.
    """
    # Sample a subset if there are too many points
    if len(embeddings) > 500:
        indices = np.random.choice(len(embeddings), 500, replace=False)
        sample_embeddings = embeddings[indices]
        sample_labels = [labels[i] for i in indices]
    else:
        sample_embeddings = embeddings
        sample_labels = labels

    # Apply t-SNE
    print(f"Applying t-SNE to {title} embeddings...")
    tsne = TSNE(n_components=2, random_state=42)
    reduced_embeddings = tsne.fit_transform(sample_embeddings)

    # One scatter call per class so each class gets its own legend entry.
    label_array = np.array(sample_labels)
    plt.figure(figsize=(10, 8))
    for label in np.unique(sample_labels):
        mask = label_array == label
        plt.scatter(reduced_embeddings[mask, 0], reduced_embeddings[mask, 1],
                    label=label_names[label], alpha=0.7)

    plt.title(f't-SNE Visualization of {title} Embeddings')
    plt.legend()
    plt.tight_layout()
    plt.show()
    return reduced_embeddings

# Visualize both embedding types on the TEST split.
# The following cell pairs these coordinates row-by-row with texts_test and
# labels_test, so they have to be computed from the test embeddings. The
# loader returns at most 500 test documents, which means no subsampling
# happens here and row order is preserved. (Projecting the training set
# instead yields 500 randomly chosen training points that do not correspond
# to the test texts.)
wl = visualize_embeddings(word_embeddings_test, labels_test, label_names, "Word-level")
sl = visualize_embeddings(sentence_embeddings_test, labels_test, label_names, "Sentence-level")

In [None]:
# One row per test document: its text, its 2-D coordinates in the word-level
# (wl*) and sentence-level (sl*) projections, and its label.
# Built column-wise so every column keeps its own dtype and a length mismatch
# between the inputs raises instead of being padded with NaN.
# NOTE(review): the rows are only meaningful if `wl` and `sl` were computed
# from the test split in the same order as `texts_test` -- confirm upstream.
df_texts_test = pd.DataFrame({
    'text': texts_test,
    'wl0': wl[:, 0],
    'wl1': wl[:, 1],
    'sl0': sl[:, 0],
    'sl1': sl[:, 1],
    'y': labels_test,
})

# Lookup table from label id to topic name, sized from label_names rather
# than hard-coding four ids.
ln = pd.DataFrame({
    'y': list(range(len(label_names))),
    'topics': label_names,
})

df_texts_test = df_texts_test.merge(ln, on=['y'])

In [None]:
# Texts whose sentence-level coordinates fall in the box sl0 in [12, 20],
# sl1 in [-13, -5] and whose topic name contains 'athe'.
# NOTE(review): the box limits are hard-coded; t-SNE coordinates change with
# the data and library version, so re-read them from the plot before relying
# on this query.
df_texts_test[(df_texts_test.sl0.between(12, 20)) & (df_texts_test.sl1.between(-13, -5)) & (df_texts_test.topics.str.contains('athe'))].text.values

In [None]:
# Same coordinate box as the previous query, restricted to topics whose name
# contains 'spor', to see which sports posts land in that region.
# NOTE(review): the box limits are hard-coded and tied to one particular
# t-SNE run -- confirm against the current plot.
df_texts_test[(df_texts_test.sl0.between(12, 20)) & (df_texts_test.sl1.between(-13, -5)) & (df_texts_test.topics.str.contains('spor'))].text.values

### Cross-model embeddings analysis with topic coloring

In [None]:
def embedding_space_comparison(word_embeddings, sentence_embeddings, labels, label_names):
    """Compare the pairwise-similarity structure of two embedding spaces.

    A random sample of up to 200 documents is drawn (NumPy global random
    state). For every unordered pair of sampled documents the cosine
    similarity is computed in both spaces, and the two sets of similarities
    are correlated and plotted against each other.

    Args:
        word_embeddings: Array with one word-level embedding per row.
        sentence_embeddings: Array with one sentence-level embedding per row,
            in the same document order as `word_embeddings`.
        labels: One integer label per row, used as an index into `label_names`.
        label_names: Topic names.

    Returns:
        Tuple `(correlation, p_value, df)`: the Pearson correlation between
        the two similarity sets, its p-value, and a DataFrame with columns
        'word_sim', 'sentence_sim', 'topic_i', 'topic_j', 'same_topic'
        (one row per document pair).

    Two figures are shown: an overall scatter with a fitted trend line and an
    identity line, and a grid with one panel per topic.
    """
    from scipy.stats import linregress, pearsonr

    print("\n--- Comparing Embedding Spaces ---")

    # Sample a subset for computational efficiency if needed
    sample_size = min(len(word_embeddings), 200)
    indices = np.random.choice(len(word_embeddings), sample_size, replace=False)

    # Pairwise similarity matrices over the same sampled documents.
    word_sim = cosine_similarity(word_embeddings[indices])
    sentence_sim = cosine_similarity(sentence_embeddings[indices])

    # Topic name of each sampled document, aligned with the matrix rows.
    sample_topics = np.array([label_names[labels[i]] for i in indices], dtype=object)

    # Strict upper triangle: each unordered pair once, no self-pairs, in the
    # same row-major order as a nested `for i / for j > i` loop.
    rows, cols = np.triu_indices(word_sim.shape[0], k=1)
    df = pd.DataFrame({
        'word_sim': word_sim[rows, cols],
        'sentence_sim': sentence_sim[rows, cols],
        'topic_i': sample_topics[rows],
        'topic_j': sample_topics[cols],
    })
    df['same_topic'] = df['topic_i'] == df['topic_j']

    # Calculate correlation between similarity matrices
    correlation, p_value = pearsonr(df['word_sim'], df['sentence_sim'])

    print(f"Correlation between similarity structures: {correlation:.4f} (p={p_value:.4e})")

    # --- Figure 1: all pairs, coloured by same-topic vs. cross-topic ---
    plt.figure(figsize=(12, 10))

    pair_styles = (
        (True, 'Same Topic', 'darkblue', 0.7),
        (False, 'Different Topics', 'lightblue', 0.3),
    )
    for same_topic, label, color, alpha in pair_styles:
        subset = df[df['same_topic'] == same_topic]
        plt.scatter(subset['word_sim'], subset['sentence_sim'],
                    alpha=alpha, c=color, label=label)

    lowest = df['word_sim'].min()
    highest = df['word_sim'].max()

    # Add trend line
    slope, intercept, r, p, stderr = linregress(df['word_sim'], df['sentence_sim'])
    x = np.array([lowest, highest])
    plt.plot(x, slope * x + intercept, color='red', linestyle='--',
             label=f'Trend (r={correlation:.2f})')

    # Add a diagonal (y = x) line for reference
    plt.plot([lowest, highest], [lowest, highest],
             'k--', alpha=0.3, label='Perfect Correlation')

    plt.xlabel('Word Embedding Cosine Similarity')
    plt.ylabel('Sentence Embedding Cosine Similarity')
    plt.title('Comparison of Similarity Structures')

    # The legend is created after every labelled artist has been drawn so the
    # reference line is listed as well.
    plt.legend(loc='upper left')

    # --- Figure 2: one panel per topic ---
    plt.figure(figsize=(14, 8))

    unique_topics = label_names
    n_topics = len(unique_topics)
    n_cols = n_topics // 2 + n_topics % 2  # two rows, rounded-up column count

    for i, topic in enumerate(unique_topics):
        plt.subplot(2, n_cols, i + 1)

        # Get pairs where either topic is the current one
        topic_df = df[(df['topic_i'] == topic) | (df['topic_j'] == topic)]

        # Color by whether both documents are from the same topic
        same = topic_df[topic_df['same_topic']]
        diff = topic_df[~topic_df['same_topic']]

        if len(same) > 0:
            plt.scatter(same['word_sim'], same['sentence_sim'],
                        color='darkblue', alpha=0.7, label='Same Topic')

        if len(diff) > 0:
            plt.scatter(diff['word_sim'], diff['sentence_sim'],
                        color='lightblue', alpha=0.3, label='Different Topic')

        plt.title(f'Topic: {topic}')
        plt.xlabel('Word Sim' if i >= n_topics // 2 else '')
        plt.ylabel('Sentence Sim' if i % n_cols == 0 else '')

        # Add trend line
        if len(topic_df) > 2:  # Need at least 3 points for a reasonable trend
            slope, intercept, r, p, stderr = linregress(topic_df['word_sim'], topic_df['sentence_sim'])
            x = np.array([topic_df['word_sim'].min(), topic_df['word_sim'].max()])
            plt.plot(x, slope * x + intercept, color='red', linestyle='--', label=f'r={r:.2f}')

        if i == 0:  # Only add legend to the first subplot
            plt.legend(fontsize='small')

    plt.tight_layout()
    plt.suptitle('Topic-Specific Similarity Comparisons', y=1.02)
    plt.show()

    # Analyze topic-based similarity patterns
    print("\nTopic-Specific Similarity Analysis:")
    for topic in unique_topics:
        # Get pairs where both documents are from this topic
        topic_pairs = df[(df['topic_i'] == topic) & (df['topic_j'] == topic)]
        if len(topic_pairs) > 0:
            word_mean = topic_pairs['word_sim'].mean()
            sent_mean = topic_pairs['sentence_sim'].mean()
            print(f"Topic '{topic}': Avg Word Sim = {word_mean:.3f}, Avg Sentence Sim = {sent_mean:.3f}")

    return correlation, p_value, df

# Compare embedding spaces with topic coloring
correlation, p_value, comparison_df = embedding_space_comparison(
    word_embeddings_train, sentence_embeddings_train, labels_train, label_names
)