N-gram Language Models: Understanding Context in Language¶

This lab introduces n-gram language models, a fundamental approach to understanding and predicting text. We'll explore how these models work, their limitations, and why they led to the development of more sophisticated approaches like neural networks and transformers.

Understanding Language Models: From N-grams to Transformers¶

Modern Large Language Models like GPT-4 and Claude appear almost magical in their ability to understand and generate human language. However, at their core, these models are attempting to solve the same fundamental challenge that motivated the earliest language models: predicting what word comes next given some context. By starting with N-gram models, we can build an intuitive understanding of this core task while encountering key concepts that remain relevant in today's most advanced models.

Why Start with N-grams?¶

N-gram models represent one of the simplest approaches to modeling language: they predict the next word based on a fixed window of previous words. This straightforward approach helps us understand several fundamental concepts:

  1. Context Windows: Like N-gram models, modern transformers also process text through windows of context, though they do so in a much more sophisticated way. Understanding the limitations of fixed-context windows in N-grams helps explain why mechanisms like attention were revolutionary.

  2. The Sparsity Problem: As we increase N to capture more context, we quickly run into the problem of data sparsity - many perfectly valid word sequences never appear in our training data. This same challenge motivated the development of neural language models that can generalize beyond exact matches.

  3. Probability Distributions: N-gram models output probability distributions over possible next words. While modern LLMs use much more complex mechanisms to generate these distributions, the basic idea of modeling language as probability distributions over tokens remains the same.

  4. Evaluation Metrics: Concepts like perplexity, which we use to evaluate N-gram models, remain relevant for evaluating modern language models. Understanding these metrics helps us grasp what it means for a language model to be "good."

By working through this lab, you'll develop an intuition for these core concepts that will help you better understand how modern language models work. As we progress from simple counting-based N-grams to neural approaches, you'll see how each innovation helps address the limitations of previous approaches while building on the same fundamental principles.

Setup¶

First, let's import our required libraries and load our data:

In [1]:
import pandas as pd
import nltk
import seaborn as sns
import string
from nltk.util import ngrams
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist, FreqDist
import numpy as np
import matplotlib.pyplot as plt

# Download required NLTK data for tokenization
nltk.download('punkt')

# Create a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)
[nltk_data] Downloading package punkt to /home/zjelveh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Load our datasets¶

randomized_train is a set of clinical trial descriptions that contain the word randomized in them. We will train ngram language models with this data.

We will test the performance of the ngram language models on heldout data: randomized_test is a different set of clinical trial descriptions that contain the word randomized in them. physics_test is a set of clinical trial descriptions that contain the word physics in them.

When evaluated on held-out data, the n-gram language model should fit randomized_test better (i.e., assign it lower perplexity) than physics_test, since randomized_test comes from the same distribution as the training data.

In [2]:
randomized_train = pd.read_csv('https://www.dropbox.com/scl/fi/rp66akszfxugpavlh8ewy/randomized_train.csv?rlkey=ayx18lr6hdkoawksvynuydcd1&st=cq9a6rx0&dl=1')
randomized_test = pd.read_csv('https://www.dropbox.com/scl/fi/0d0kfhafi3aju5yzejek9/randomized_test.csv?rlkey=vcfvl2vmz85qcenfpx46us9c8&st=iwd2lzws&dl=1')
physics_test = pd.read_csv('https://www.dropbox.com/scl/fi/nat7w9dfkv7om2vsqbh38/physics_test.csv?rlkey=hoe1fzu2a9hr183lbu7a58zqr&st=0npcpsfv&dl=1')

Understanding N-grams¶

N-grams are contiguous sequences of n items from a text. In our case, these items will be words. Let's start by preparing our text data and examining unigrams (individual words).
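As a quick illustration (a toy sentence, not drawn from our datasets), here is what bigrams and trigrams of a short token list look like:

from nltk.util import ngrams  # already imported in the setup cell

toy_tokens = "the trial was randomized and controlled".split()
print(list(ngrams(toy_tokens, 2)))  # [('the', 'trial'), ('trial', 'was'), ('was', 'randomized'), ...]
print(list(ngrams(toy_tokens, 3)))  # [('the', 'trial', 'was'), ('trial', 'was', 'randomized'), ...]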

Preprocess training data.¶

Note: This is fairly basic preprocessing

In [3]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Only keep words 3 or more characters long
    text = ' '.join([word for word in text.split() if len(word) > 2])
    # Remove punctuation
    text = text.translate(translator)
    return text

First, combine all descriptions into a single text using pandas str.cat()

The sep=' ' parameter adds a space between each description

In [4]:
randomized_text_train = preprocess_text(randomized_train.description.str.cat(sep=' '))

Now do the same for other datasets:

In [5]:
randomized_text_test = preprocess_text(randomized_test.description.str.cat(sep=' '))
physics_text_test = preprocess_text(physics_test.description.str.cat(sep=' '))

Convert our text into a list of tokens (words) using NLTK's tokenizer

This is more sophisticated than just splitting on spaces - it handles contractions, punctuation, and other special cases properly

In [6]:
rand_tokens_train = nltk.word_tokenize(randomized_text_train)
rand_tokens_test = nltk.word_tokenize(randomized_text_test)
physics_tokens_test = nltk.word_tokenize(physics_text_test)
In [7]:
# Let's look at a small sample of our tokens
print("\nFirst 20 tokens:")
print(rand_tokens_train[:20])
First 20 tokens:
['purpose', 'assess', 'the', 'efficacy', 'physical', 'training', 'program', 'patients', 'with', 'ankylosing', 'spondylitis', 'as', 'randomized', 'controlled', 'study', 'introduction', 'osteoporosis', 'and', 'osteopenia', 'are']
In [8]:
# Basic corpus statistics
print("Training corpus size:", len(rand_tokens_train), "tokens")
print("Vocabulary size:", len(set(rand_tokens_train)), "unique words")
Training corpus size: 223812 tokens
Vocabulary size: 19800 unique words

Now, let's create our language model with a specified n-gram size¶

Step 1: Create a list of ngrams

In [9]:
ngrams_list = list(ngrams(rand_tokens_train, 2))
ngrams_list[:5]
Out[9]:
[('purpose', 'assess'),
 ('assess', 'the'),
 ('the', 'efficacy'),
 ('efficacy', 'physical'),
 ('physical', 'training')]

Step 2: Create conditional frequency distribution

For each n-gram, we use all but the last word as the condition

In [10]:
cfd = ConditionalFreqDist((' '.join(gram[:-1]), gram[-1]) for gram in ngrams_list)

ConditionalFreqDist is an object that records, for each context (in this case a single word), how many times each word follows it.

In [11]:
# These are the words that follow 'randomized', along with how many times each one occurs
cfd['randomized']
Out[11]:
FreqDist({'controlled': 594, 'clinical': 189, 'trial': 177, 'phase': 145, 'receive': 118, 'study': 113, 'into': 101, 'control': 75, 'either': 71, '11': 64, ...})

Step 3: Convert to probabilities

Note: MLEProbDist estimates the n-gram probabilities by maximum likelihood, i.e., P(word | context) = count(context, word) / count(context)

In [12]:
cpd = ConditionalProbDist(cfd, MLEProbDist)
In [13]:
# Example: P(control|randomized)
cpd['randomized'].prob('control')
Out[13]:
0.029988004798080767
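To see what MLEProbDist is doing under the hood, we can recompute this probability directly from the counts stored in cfd (a quick sanity check, not part of the original lab):

# Maximum likelihood estimate: count(context, word) / count(context)
print(cfd['randomized']['control'] / cfd['randomized'].N())  # 75 / 2501 ≈ 0.0300, matching cpd above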
In [14]:
# Look at word frequencies/counts
word_counts = FreqDist(rand_tokens_train)

print("\nMost common words:")
for word, count in word_counts.most_common(10):
    print(f"{word}: {count}")
Most common words:
the: 13409
and: 8888
will: 4356
with: 3908
for: 2761
patients: 2629
randomized: 2501
study: 2499
this: 2096
treatment: 1529
In [15]:
# Plotting most common words using seaborn
most_common = pd.DataFrame(word_counts.most_common(10), columns=['word', 'count'])

ax = sns.barplot(data=most_common, x='word', y='count', color='blue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');

Understanding Word Distributions¶

Let's examine how word frequencies are distributed in our corpus. This will help us understand the challenge of data sparsity in language modeling.

The big spike at 1 shows that most words in our corpus show up very few times.
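We can quantify that claim with a quick check (a small addition using the word_counts object created above):

# Fraction of the vocabulary that appears exactly once in the training corpus
singleton_words = sum(1 for word, count in word_counts.items() if count == 1)
print(f"{singleton_words:,} of {len(word_counts):,} unique words appear exactly once "
      f"({singleton_words / len(word_counts):.1%})")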

In [16]:
counts = pd.DataFrame(list(word_counts.values()), columns=['counts'])
counts['log_counts'] = np.log10(counts.counts)
sns.histplot(data=counts, x='counts', bins=50)
Out[16]:
<AxesSubplot: xlabel='counts', ylabel='Count'>
In [17]:
# Same plot with counts logged for visualization purposes
sns.histplot(data=counts, x='log_counts', bins=50)
Out[17]:
<AxesSubplot: xlabel='log_counts', ylabel='Count'>

Exercise 1: Exploring Context¶

Try answering these questions using the code we've written (feel free to ask an LLM for help):

  1. What are the most common words that follow "control" in our dataset?
  2. What percentage of bigrams appear only once?
  3. How many unique words follow "the" vs. a less common word like "experiment"?
In [18]:
# Recall that we've already created these objects in our lab 
# that you can use to answer these questions:
# cfd: Conditional frequency distribution
# word_counts: Frequency distribution of individual words
# ngrams_list: List of bigrams from our corpus
In [19]:
# 1.
# Get frequency distribution for words following "control"
control_following = cfd['control']

# Display the most common words
print("Most common words following 'control':")
for word, count in control_following.most_common(10):
    print(f"{word}: {count}")
    
Most common words following 'control':
group: 212
trial: 59
and: 36
arm: 28
the: 23
condition: 17
groups: 13
intervention: 8
groupthe: 7
study: 7
In [20]:
# 2. 
# Create a frequency distribution of all bigrams
bigram_freq = FreqDist(ngrams_list)
In [21]:
# bigram_freq is essentially a dictionary where the keys are bigrams and the
# values are how often they appear
bigram_freq
Out[21]:
FreqDist({('patients', 'with'): 770, ('this', 'study'): 686, ('randomized', 'controlled'): 594, ('the', 'study'): 537, ('will', 'randomized'): 464, ('and', 'the'): 420, ('the', 'investigators'): 402, ('controlled', 'trial'): 401, ('study', 'will'): 342, ('patients', 'will'): 332, ...})
In [22]:
# Count how many bigrams appear exactly once
once_count = sum(1 for bigram, count in bigram_freq.items() if count == 1)

# Calculate the percentage
total_unique_bigrams = len(bigram_freq)
percentage_once = (once_count / total_unique_bigrams) * 100

print(f"Total unique bigrams: {total_unique_bigrams}")
print(f"Bigrams appearing once: {once_count}")
print(f"Percentage of bigrams appearing only once: {percentage_once:.2f}%")
Total unique bigrams: 124248
Bigrams appearing once: 99595
Percentage of bigrams appearing only once: 80.16%
In [23]:
#3.
# Count unique words following "the"
the_following = cfd['the']
the_following
Out[23]:
FreqDist({'study': 537, 'investigators': 402, 'efficacy': 250, 'intervention': 224, 'treatment': 222, 'effectiveness': 193, 'first': 191, 'primary': 191, 'effect': 187, 'purpose': 167, ...})
In [24]:
unique_after_the = len(the_following)
unique_after_the
Out[24]:
3023
In [25]:
# Count unique words following "experiment"
experiment_following = cfd['experiment']
unique_after_experiment = len(experiment_following)
unique_after_experiment
Out[25]:
9

Measuring Language Model Quality with Perplexity¶

Now that we've built our n-gram models, we need a way to evaluate how well they work. The standard metric for evaluating language models is perplexity.

Perplexity tells us how "surprised" or "perplexed" our model is by new text: lower perplexity means the model expected those word combinations and is making better predictions. Here's how it works:

  1. For each word in our text, we:
    • Look at the previous n-1 words (the context)
    • Calculate how likely our model thinks the current word is
    • Add this to our running calculation
  2. At the end, we take the inverse of the geometric mean of these probabilities (equivalently, the exponential of the negative average log probability)

A few important things to note:

  • If our model thinks a word sequence is very likely (high probability), it contributes to lower perplexity
  • If our model thinks a word sequence is unlikely (low probability), it contributes to higher perplexity
  • If our model has never seen a particular sequence before, it will be very perplexed!
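For intuition, here is a toy calculation with made-up probabilities (not from our model): if a model assigns probability 0.25 to every word in a test sequence, its perplexity is exactly 4, as if it were choosing uniformly among four equally likely words at each step.

import numpy as np  # already imported in the setup cell

probs = [0.25, 0.25, 0.25, 0.25]        # model's probability for each test word
print(np.exp(-np.mean(np.log(probs))))  # 4.0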

Since we are going to experiment with different context lengths, we'll first create a function that generates a language model given tokens.

In [26]:
def create_ngram_model(tokens, n):
    """
    Creates an n-gram language model from a sequence of tokens.
    
    The model estimates P(word_n | word_1, ..., word_{n-1}) using maximum likelihood estimation:
    P(word_n | context) = count(context, word_n) / count(context)
    
    Parameters:
        tokens: List of tokens from which to build the model
        n: Length of n-gram sequences to consider
    
    Returns:
        ConditionalProbDist object containing the probability distributions
    """
    # Create n-grams from training data
    # Each n-gram is a sequence of n consecutive tokens
    ngrams_list = list(ngrams(tokens, n))
    
    # Create conditional frequency distribution
    # For each n-gram (w1, w2, ..., wn), we use (w1, ..., w{n-1}) as the condition
    # and wn as the word we're trying to predict
    # The resulting cfd maps: context -> {word: count}
    cfd = ConditionalFreqDist(
        (' '.join(gram[:-1]), gram[-1]) 
        for gram in ngrams_list
    )
    
    # Convert raw counts to probabilities using maximum likelihood estimation
    # For each context, P(word|context) = count(context,word) / count(context)
    # MLEProbDist handles this normalization for us
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    
    return cpd
In [27]:
cpd_2 = create_ngram_model(rand_tokens_train, 2)

Then we will create a function that computes perplexity which takes as input:

  • a set of test tokens,
  • the ngram language model created from create_ngram_model
  • the value of n in the ngram language model
In [28]:
def perplexity(test_tokens, cpd, n):
    """
    Calculate perplexity of a language model on test data.
    
    Perplexity is defined as: exp(-1/N * sum(log P(w_i|context_i)))
    where N is the number of words and P(w_i|context_i) is the model's
    predicted probability of word w_i given its context.
    
    Lower perplexity means the model is less "surprised" by the test data,
    indicating better predictions.
    
    Parameters:
        test_tokens: List of tokens to evaluate on
        cpd: Conditional probability distribution from training
        n: Size of n-grams used in the model
    
    Returns:
        float: Perplexity score (lower is better)
    """
    # Create n-grams from test data
    test_ngrams = list(ngrams(test_tokens, n))
    N = len(test_ngrams)
    log_prob_sum = 0
    
    # Keep track of problematic sequences for analysis (collected for debugging; not returned)
    worst_sequences = []
    
    for ngram in test_ngrams:
        # Split into context and word to predict
        context = ' '.join(ngram[:-1])  # Previous n-1 words
        word = ngram[-1]                # Word to predict
        
        try:
            # Get model's predicted probability P(word|context)
            prob = cpd[context].prob(word)
            
            if prob == 0:
                # Handle zero probabilities by assigning a small value
                # This prevents log(0) which would be undefined
                # 1e-7 is a form of smoothing to handle unseen sequences
                prob = 1e-7
                worst_sequences.append((ngram, prob))
        except Exception:
            # Handle cases where we haven't seen this context before
            prob = 1e-7
            worst_sequences.append((ngram, prob))
            
        # Add log probability to our running sum
        # We use log probabilities to prevent numerical underflow
        # when multiplying many small probabilities
        log_prob_sum += np.log(prob)
    
    # Calculate final perplexity:
    # exp(-1/N * sum(log P(w_i|context_i)))
    return np.exp(-log_prob_sum / N)
In [29]:
# Let's calculate perplexity on our different datasets
print("Perplexity on training data:", perplexity(rand_tokens_train, cpd_2, 2))
print("Perplexity on test data (same domain):", perplexity(rand_tokens_test, cpd_2, 2))
print("Perplexity on physics data (different domain):", perplexity(physics_tokens_test, cpd_2, 2))
Perplexity on training data: 34.09545368814962
Perplexity on test data (same domain): 15263.17518121725
Perplexity on physics data (different domain): 61007.22594605268

Now let's see what happens when we use a trigram language model (n=3)

In [30]:
# First create the model on training data
cpd_3 = create_ngram_model(rand_tokens_train, 3)

Then evaluate on our three sets (train, test from same distribution, test from different distribution)

In [31]:
# Let's calculate perplexity on our different datasets
print("Perplexity on training data:", perplexity(rand_tokens_train, cpd_3, 3))
print("Perplexity on test data (same domain):", perplexity(rand_tokens_test, cpd_3, 3))
print("Perplexity on physics data (different domain):", perplexity(physics_tokens_test, cpd_3, 3))
Perplexity on training data: 2.67105611586623
Perplexity on test data (same domain): 420268.21799560654
Perplexity on physics data (different domain): 2051104.589677693

Exercise 2: Perplexity and sparsity¶

Now create a language model with n=4 and evaluate on our various datasets.

  • Explain why perplexity on the training set goes down as we increase n
  • Why does perplexity go up on BOTH of the test sets, even the one from the same distribution?
In [32]:
# Create a 4-gram language model
cpd_4 = create_ngram_model(rand_tokens_train, 4)

# Calculate perplexity on all datasets
print("Perplexity on training data:", perplexity(rand_tokens_train, cpd_4, 4))
print("Perplexity on test data (same domain):", perplexity(rand_tokens_test, cpd_4, 4))
print("Perplexity on physics data (different domain):", perplexity(physics_tokens_test, cpd_4, 4))
Perplexity on training data: 1.2732767275134642
Perplexity on test data (same domain): 1749108.1152624553
Perplexity on physics data (different domain): 6236752.116706977

Perplexity on the training set goes down as n increases because the conditional distributions become more and more specific. Most 4-gram contexts appear only once or a handful of times in the training data, so the model effectively memorizes the exact continuations it has seen and assigns them very high probability.

Perplexity goes up on both test sets because of the sparsity problem: as n increases, the number of possible n-gram combinations grows exponentially while the amount of training data stays fixed. As a result, most n-grams in the test data (even test data from the same distribution) were never observed in training and fall back to the tiny smoothing probability.
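One way to see this sparsity directly (a quick check using the cpd_4 model and test tokens defined above) is to count how many 4-gram contexts in the test data were never observed during training:

# Fraction of test 4-gram contexts (3-word prefixes) that the model has never seen
test_ngrams_4 = list(ngrams(rand_tokens_test, 4))
train_contexts = set(cpd_4.conditions())
unseen = sum(1 for gram in test_ngrams_4 if ' '.join(gram[:-1]) not in train_contexts)
print(f"{unseen / len(test_ngrams_4):.1%} of test 4-gram contexts never appear in training")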

From Counting to Learning: Moving Beyond N-grams¶

Our exploration of N-gram models has revealed some fundamental challenges in modeling language. Let's examine why these limitations lead us naturally toward machine learning approaches.

The Limitations of Counting¶

Through our experiments with N-grams, we've encountered three key challenges:

First, we saw how increasing N to capture more context leads to severe data sparsity. When we moved from bigrams to trigrams, our perplexity on test data increased dramatically. This happened because many legitimate word sequences in our test data never appeared in our training set --- we simply can't observe every possible combination of words.

Second, N-gram models have no way to recognize similar contexts. Consider these phrases from clinical trials:

  • "patients were randomly assigned to"
  • "subjects were randomly allocated to"

These convey the same meaning, but our N-gram model treats them as completely different sequences. It can't recognize that "patients" and "subjects" play similar roles, or that "assigned" and "allocated" are nearly synonymous in this context.

Third, N-gram models are extremely rigid in their use of context. A bigram model looking at "The patient was" can only use "was" to predict the next word, even though "patient" might be more informative in this case. We need a more flexible way to use context.

Why Machine Learning Helps¶

Machine learning approaches, starting with the simple logistic regression model we'll explore next, help address these limitations in several ways:

  1. Parameter Sharing: Instead of treating each word sequence as unique, machine learning models can learn patterns that generalize across similar contexts. They might learn, for example, that words following "randomly" tend to be similar regardless of what came before.

  2. Feature-Based Representation: By representing words as features rather than atomic units, we can capture similarities between different contexts. This helps our model generalize to sequences it hasn't seen exactly during training.

  3. Learnable Context Usage: Rather than using a fixed context window, machine learning models can learn which parts of the context are most important for prediction. This flexibility helps them make better use of available information.

A Simple First Step: Logistic Regression¶

We'll begin our exploration of machine learning approaches with logistic regression - one of the simplest machine learning models. While it won't solve most of the problems we've identified, it will help us understand how moving from counting to learning can improve our language models.

Let's start by focusing on a specific prediction task: given a word, predict whether it will be followed by "controlled" (a key word in clinical trials). This simplified task will let us clearly see how machine learning differs from our N-gram approach.

In [33]:
from sklearn.linear_model import LogisticRegression as lr
from sklearn.feature_extraction.text import CountVectorizer
In [34]:
# First, let's look at our baseline from n-grams
print("N-gram probability of 'controlled' following 'randomized':")
print(cpd['randomized'].prob('controlled'))
N-gram probability of 'controlled' following 'randomized':
0.2375049980007997

Now let's frame this as a binary classification problem. For each bigram in our corpus, the input is the first word and the label indicates whether the second word is 'controlled'.

In [35]:
# Create training data
bigrams_for_logistic = [bigram[0] for bigram in ngrams_list]  # First word of each bigram
y = [1 if bigram[1]=='controlled' else 0 for bigram in ngrams_list]  # Is second word 'controlled'?
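To make the setup concrete, we can peek at the first few (context word, label) pairs (a quick look at the lists just created):

for context_word, label in list(zip(bigrams_for_logistic, y))[:5]:
    print(context_word, label)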
        

In the code below, the vectorizer creates a sparse matrix X where:

  • Each row represents a word (the context)
  • Each column represents a feature (presence/absence of words)
  • X[i,j] = 1 if word j appears in context i, 0 otherwise

This converts our text prediction problem into a standard classification task:

  • Input: Feature vector representing the current word
  • Output: Probability distribution over possible next words (in this case, controlled or not controlled)
In [36]:
# Convert words to features using bag-of-words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(bigrams_for_logistic)
In [37]:
print("\nFeature matrix shape:", X.shape)
print("Number of unique words (features):", len(vectorizer.get_feature_names_out()))
Feature matrix shape: (223811, 19692)
Number of unique words (features): 19692
In [38]:
# Train logistic regression model
clf = lr(n_jobs=10)
clf.fit(X, y)
Out[38]:
LogisticRegression(n_jobs=10)
In [39]:
# Let's examine predictions for specific words
test_words = ["randomized", "random", "the", "study", "patients"]
X_test = vectorizer.transform(test_words)
predictions = clf.predict_proba(X_test)[:, 1]
predictions
Out[39]:
array([2.34965865e-01, 5.38680605e-04, 1.15498180e-04, 6.99539927e-04,
       2.68306457e-04])
In [40]:
print("\nProbability of 'controlled' following:")
for word, prob in zip(test_words, predictions):
    
    print(f"From logistic regression model: {word}: {prob:.3f}")
    print(f"From bigram language model: {word}: {cpd[word].prob('controlled'):.3f}")
    print('===========')
Probability of 'controlled' following:
From logistic regression model: randomized: 0.235
From bigram language model: randomized: 0.238
===========
From logistic regression model: random: 0.001
From bigram language model: random: 0.000
===========
From logistic regression model: the: 0.000
From bigram language model: the: 0.000
===========
From logistic regression model: study: 0.001
From bigram language model: study: 0.001
===========
From logistic regression model: patients: 0.000
From bigram language model: patients: 0.000
===========

Multinomial regression¶

The logistic regression model generates probabilities that are very similar to the ones from the bigram language model, which is what we want. However, it arrives at them not by simple counting but by optimizing an objective function; we'll talk more about this in the next class.
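One way to peek at what the model has learned (a quick illustration using the clf and vectorizer objects above; the exact weight value depends on the fitted model):

# The learned weight for the 'randomized' feature; a large positive value means
# that seeing 'randomized' pushes the prediction toward 'controlled'
feature_names = list(vectorizer.get_feature_names_out())
print(clf.coef_[0, feature_names.index('randomized')])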

Predicting just 'controlled' is limiting. Let's extend this to predict the full distribution over possible next words using multinomial logistic regression.

In the process we'll see that even this approach is computationally expensive.
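To get a rough sense of the cost (a back-of-the-envelope calculation using the vocabulary size computed earlier): a multinomial model over the full vocabulary needs one weight per (input feature, output class) pair.

# With roughly 19,800 input features and 19,800 output classes, the weight matrix is huge
vocab_size = len(set(rand_tokens_train))
print(f"Approximate number of weights for a full-vocabulary model: {vocab_size ** 2:,}")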

In [41]:
# Filter to more common words to make computation manageable
min_freq = 500
filtered_tokens = [t for t in rand_tokens_train if word_counts[t] >= min_freq]
print(f"\nVocabulary size after filtering (freq >= {min_freq}):", 
      len(set(filtered_tokens)))
Vocabulary size after filtering (freq >= 500): 30
In [42]:
# Create training data for multinomial model
bigrams_filtered = list(ngrams(filtered_tokens, 2))
X_words = [bigram[0] for bigram in bigrams_filtered]
y_next = [bigram[1] for bigram in bigrams_filtered]

Note: y_next is no longer a vector of just 0s and 1s, but is now a list of words.

In [43]:
y[:20] # outcome for logistic regression, where 1 means the next word is 'controlled'
Out[43]:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
In [44]:
y_next[:20] # outcome for multinomial logistic regression
Out[44]:
['patients',
 'with',
 'randomized',
 'controlled',
 'study',
 'and',
 'are',
 'and',
 'are',
 'for',
 'and',
 'the',
 'the',
 'with',
 'are',
 'and',
 'the',
 'and',
 'and',
 'the']
In [45]:
# Convert to features
vectorizer_multi = CountVectorizer()
X_filtered = vectorizer_multi.fit_transform(X_words)
In [46]:
# Train multinomial model
clf_multi = lr(multi_class='multinomial', n_jobs=10, max_iter=500)
clf_multi.fit(X_filtered, y_next)
Out[46]:
LogisticRegression(max_iter=500, multi_class='multinomial', n_jobs=10)
In [47]:
def compare_distributions(word, cpd, multi_model, vectorizer, top_k=10):
    """
    Compares how N-gram and multinomial logistic regression models predict
    the next word distribution.
    
    This function visualizes the differences between:
    1. Simple counting-based N-gram probabilities
    2. Learned probabilities from the multinomial model
    
    The comparison helps us understand how moving beyond simple counting
    affects our probability estimates.
    
    Parameters:
        word: String - The context word whose following distribution we want to examine
        cpd: ConditionalProbDist - N-gram model containing count-based probabilities
        multi_model: LogisticRegression - Trained multinomial model
        vectorizer: CountVectorizer - Fitted vectorizer for processing input words
        top_k: int - Number of top predictions to compare (default: 10)
    
    Returns:
        None - Displays a plot comparing the distributions
    """
    try:
        # First, get the N-gram based probability distribution
        # cpd[word] gives us the probability distribution for words following our target word
        # .samples() returns all words that ever followed our target word in training
        vocab = cpd[word].samples()
        
        # Calculate probability for each possible next word according to N-gram model
        # This is based on simple counting: P(next|word) = count(word,next)/count(word)
        ngram_probs = [cpd[word].prob(w) for w in vocab]
        
        # Now get predictions from our multinomial model
        # First transform our word into the feature representation expected by the model
        # This creates a sparse vector where each position represents a word in our vocabulary
        X_pred = vectorizer.transform([word])
        
        # Get probability distribution over all possible next words
        # The model uses learned weights to convert word features into probabilities
        # predict_proba returns a matrix; we want the first (only) row
        model_probs = multi_model.predict_proba(X_pred)[0]
        
        # Sort both distributions by probability and take top k
        # zip combines words with their probabilities
        # sorted with reverse=True gives highest probabilities first
        ngram_top = sorted(zip(vocab, ngram_probs), 
                          key=lambda x: x[1], reverse=True)[:top_k]
        model_top = sorted(zip(multi_model.classes_, model_probs), 
                          key=lambda x: x[1], reverse=True)[:top_k]
        
        # Convert to DataFrames for plotting
        # We create separate DataFrames for each model and mark their source
        # This lets us plot them side by side for comparison
        ngram_df = pd.DataFrame({
            'word': [w for w, _ in ngram_top],
            'probability': [p for _, p in ngram_top],
            'model': ['N-gram'] * top_k  # Label these points as coming from N-gram model
        })
        
        model_df = pd.DataFrame({
            'word': [w for w, _ in model_top],
            'probability': [p for _, p in model_top],
            'model': ['Multinomial'] * top_k  # Label these points as coming from multinomial model
        })
        
        # Combine into single DataFrame for plotting
        # concat vertically stacks the DataFrames, keeping all columns
        plot_df = pd.concat([ngram_df, model_df])
        
        # Create a grouped bar plot comparing the distributions
        # figsize=(15, 6) makes the plot wide enough to read all labels
        plt.figure(figsize=(15, 6))
        g = sns.barplot(data=plot_df, x='word', y='probability', hue='model')
        
        # Rotate labels for readability
        # ha='right' aligns the rotated labels with their bars
        g.set_xticklabels(g.get_xticklabels(), rotation=45, ha='right')
        
        plt.title(f'Top {top_k} Words Following "{word}"')
        plt.tight_layout()  # Adjust spacing to prevent label cutoff
    
    except Exception as e:
        # If anything goes wrong (e.g., word not in vocabulary),
        # print informative error message
        print(f"'{word}' not found or other error occurred: {str(e)}")
In [48]:
# Compare distributions for interesting words
for word in ['randomized', 'study', 'patients']:
    compare_distributions(word, cpd, clf_multi, vectorizer_multi)

We see bigger differences between the multinomial model and the n-gram language model here. This is driven by the fact that we dropped many words when training the multinomial model, while the n-gram model was built from the full vocabulary.

Exercise 3: Standardizing the vocabulary¶

To test this, create a new n-gram language model that uses the same filtered vocabulary as the multinomial logistic model. Then call compare_distributions in the same way as above (just make sure to replace the old language model with the updated one you created).

In [49]:
# Hint: you want something like this to create the n-gram model
# cpd_filtered_2 = create_ngram_model(xxx, xxx)
# just fill in the xxxs
In [50]:
# Create a filtered n-gram language model using the same tokens as the multinomial model
cpd_filtered_2 = create_ngram_model(filtered_tokens, 2)

# Compare distributions for interesting words with the filtered model
for word in ['randomized', 'study', 'patients']:
    compare_distributions(word, cpd_filtered_2, clf_multi, vectorizer_multi)
In [ ]: