# LLM-Based Qualitative Coding Example

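This notebook demonstrates how to call a language-model API to perform qualitative coding of Reddit posts: identifying themes in a small sample, consolidating them into a codebook, and then applying that codebook to every post.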

## Setup and Dependencies

In [None]:
import json
import os
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import yaml
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# Silence every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries above)
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)


## Load and Prepare the Data

In [None]:
# Load the dataset
mh = pd.read_csv('https://zjelveh.github.io/files/reddit_mh.csv')

# Stable per-post identifier (0..n-1); used later to join LLM codes back to posts
mh['uid'] = range(len(mh))

# Clean text data: lowercase and strip everything except word characters and whitespace.
# Missing posts become '' and values are coerced to str so a stray non-string
# cell cannot raise while lowercasing.
mh['clean_text'] = (
    mh['selftext']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Print basic info about the dataset
print(f"Dataset shape: {mh.shape}")
print("\nSubreddit distribution:")
print(mh.subreddit.value_counts())

In [None]:
# The prompt templates live in a remote YAML file. The built-in open() only reads
# local paths, so the file is downloaded over HTTP and parsed from the response body.
prompt_config_url = 'https://zjelveh.github.io/files/prompt_config.yaml'
prompt_config_response = requests.get(prompt_config_url, timeout=30)
prompt_config_response.raise_for_status()  # fail loudly on 404/5xx instead of parsing an error page

# Pass raw bytes so PyYAML does its own encoding detection
prompt_config = yaml.safe_load(prompt_config_response.content)
themes_prompt_template = prompt_config['prompts']['themes_prompt']

## Set Up the API Call

In [None]:
# Groq chat-completions endpoint used by query_llm
url = "https://api.groq.com/openai/v1/chat/completions"

# Prefer the GROQ_API_KEY environment variable so the key is never saved inside the
# notebook; the placeholder below is only used when that variable is not set.
groq_key = os.environ.get('GROQ_API_KEY', 'ENTER API KEY here')
API_KEY = groq_key

# Other models that can be swapped in:
#   'deepseek-r1-distill-llama-70b', 'llama-3.2-1b-preview'
model = 'llama-3.1-8b-instant'

def query_llm(prompt, 
              model=model, 
              temperature=0.2,
              API_KEY=groq_key,
              max_tokens=8000,
              timeout=120):
    """Send one single-turn chat request to the Groq API.

    Parameters
    ----------
    prompt : str
        Text sent as the only user message.
    model : str
        Model identifier placed in the request payload.
    temperature : float
        Sampling temperature. The low default keeps coding decisions consistent.
    API_KEY : str
        Bearer token for the Authorization header.
    max_tokens : int
        Upper bound on generated tokens.
    timeout : float
        Seconds to wait for the server before requests raises a timeout error,
        so a stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw HTTP response. The status code is not checked here; callers
        call ``.json()`` and inspect the payload themselves.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,  # honour the argument instead of a hard-coded value
        "max_tokens": max_tokens
    }

    return requests.post(url, headers=headers, json=data, timeout=timeout)

## Part 1: LLM-Based Inductive Coding

### Step 1: Sample Posts for Theme Identification

In [None]:
# Draw a fixed-size random subset of posts to discover themes from
# (random_state pins the draw so reruns select the same posts)
theme_sample_size = 20
theme_sample = mh.sample(random_state=42, n=theme_sample_size)

### Step 2: Identify Themes in the Sample

In [None]:
# Split the sampled posts into consecutive chunks of `batch_size` rows;
# the last chunk is shorter when the sample size is not a multiple of it
batch_size = 5
batch_starts = range(0, len(theme_sample), batch_size)
batches = [theme_sample.iloc[start:start + batch_size] for start in batch_starts]
len(batches)

In [None]:
def _extract_json_text(reply):
    """Return the JSON payload of an LLM reply, unwrapping a Markdown code fence if present."""
    for fence in ("```json", "```"):
        if fence in reply:
            return reply.split(fence, 1)[1].split("```", 1)[0].strip()
    return reply


# Function to process a batch and extract themes
def process_batch(batch, batch_num, max_post_chars=None):
    """Ask the LLM to identify themes in one batch of posts.

    Parameters
    ----------
    batch : pandas.DataFrame
        Posts to analyse; the 'subreddit', 'title' and 'clean_text' columns are read.
    batch_num : int
        Zero-based position of the batch, used only in the progress message.
    max_post_chars : int or None
        When given, each post body is cut to this many characters before it is
        placed in the prompt. The default (None) sends the full text.

    Returns
    -------
    dict
        The JSON object parsed from the model's reply, or ``{"themes": []}``
        when the request fails or the reply is not a JSON object.
    """
    # Prepare posts text
    posts_text = ""
    for j, (_, row) in enumerate(batch.iterrows()):
        post_body = row['clean_text']
        if max_post_chars is not None:
            post_body = post_body[:max_post_chars]
        posts_text += f"POST {j+1} (from r/{row['subreddit']}):\n"
        posts_text += f"Title: {row['title']}\n"
        posts_text += f"Content: {post_body}...\n\n"

    # Fill the {batch_size} and {posts} placeholders of the theme prompt template
    prompt = themes_prompt_template.format(
        batch_size=len(batch),
        posts=posts_text
    )

    print(f"Processing batch {batch_num + 1}/{len(batches)}...")

    # Step 1: call the API. `result` is initialised first so the error handler can
    # always print it, even when the request itself raised.
    result = None
    try:
        result = query_llm(prompt).json()
        response = result["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"Error querying LLM for batch {batch_num + 1}: {e}")
        print("Result was:", result)
        return {"themes": []}

    # Step 2: extract and parse the JSON block from the reply
    try:
        batch_themes = json.loads(_extract_json_text(response))
    except Exception as e:
        print(f"Error parsing JSON from response: {e}")
        print("Response was:", response)
        return {"themes": []}

    # Callers use .get("themes", ...) on the result, so only a JSON object is usable
    if not isinstance(batch_themes, dict):
        print("Error parsing JSON from response: expected a JSON object")
        print("Response was:", response)
        return {"themes": []}
    return batch_themes

In [None]:
# Run theme identification on every batch, keeping results in batch order
all_batch_themes = [
    process_batch(batch_posts, batch_index)
    for batch_index, batch_posts in enumerate(batches)
]

### Step 3: Consolidate Themes into a Codebook

In [None]:
# Prompt template used to merge the per-batch themes into a single codebook
prompt_templates = prompt_config["prompts"]
consolidated_themes_prompt = prompt_templates["consolidate_prompt"]

In [None]:
# Prepare batch themes text for insertion into prompt
batch_themes_text = ""

for i, batch_result in enumerate(all_batch_themes):
    batch_themes_text += f"BATCH {i+1}:\n"

    for theme in batch_result.get("themes", []):
        # Indicators come from LLM output; coerce to str and tolerate a null value
        # so one malformed theme cannot break the join
        indicators = ', '.join(str(item) for item in (theme.get('indicators') or []))
        batch_themes_text += f"- Theme: {theme.get('name', 'Unnamed')}\n"
        batch_themes_text += f"  Description: {theme.get('description', 'No description')}\n"
        batch_themes_text += f"  Indicators: {indicators}\n"
    batch_themes_text += "\n"


def _fill_template(template, text):
    """Insert `text` into a str.format-style template whose placeholder name is unknown.

    Every named placeholder receives `text`. If the template has no usable named
    placeholder (none at all, positional ones, or braces that are not valid
    format syntax), `text` is appended after the template instead.
    """
    try:
        fields = [name for _, name, _, _ in string.Formatter().parse(template)
                  if name is not None]
    except ValueError:
        fields = None  # unbalanced braces: the template is not format syntax

    if fields is not None and all(name.isidentifier() for name in fields):
        filled = template.format(**{name: text for name in fields})
        return filled if fields else f"{filled}\n\n{text}"
    return f"{template}\n\n{text}"


# The themes gathered above must actually reach the model: previously the raw
# template was sent and batch_themes_text was never used.
# NOTE(review): the placeholder name inside prompt_config.yaml is not visible from
# this notebook, so every named placeholder is filled with the batch themes --
# confirm the consolidate prompt has exactly one placeholder.
consolidation_prompt = _fill_template(consolidated_themes_prompt, batch_themes_text)

print("Consolidating")
result = query_llm(consolidation_prompt)

result = result.json()
if "choices" not in result:
    # API-level failure (bad key, rate limit, ...): show the payload, not a bare KeyError
    raise RuntimeError(f"Consolidation request failed: {result}")
response = result["choices"][0]["message"]["content"]

# Extract JSON from response
try:
    # Find JSON block in response (```json fence, plain ``` fence, or bare JSON)
    if "```json" in response:
        json_text = response.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in response:
        json_text = response.split("```", 1)[1].split("```", 1)[0]
    else:
        json_text = response
    codebook = json.loads(json_text)
except Exception as e:
    print(f"Error parsing JSON from response: {e}")
    print("Response was:", response)
    # Stop here: continuing would leave `codebook` undefined (or stale from an
    # earlier run) and the following cells would fail or silently use old data.
    raise


In [None]:
# Sanity check: list the name of every code in the consolidated codebook
print("Successfully created consolidated codebook with the following codes:")
for entry in codebook["codebook"]:
    code_name = entry['code']
    print(f"- {code_name}")
    
    

In [None]:
# Display the full codebook parsed from the LLM's consolidation reply
codebook

In [None]:
# Persist the codebook as indented JSON; the file name carries the model that produced it
with open(f'llm_codebook_{model}', 'w') as out_file:
    out_file.write(json.dumps(codebook, indent=2))
    

### Step 4: Apply Codes to Posts

In [None]:
# Prompt template used to apply the codebook to a single post
prompt_templates = prompt_config["prompts"]
deduction_prompt_template = prompt_templates["deduction_prompt"]


In [None]:
def _format_code(number, code):
    """Render one codebook entry as a numbered line followed by its indicators."""
    headline = f"{number}. {code['code']}: {code['description']}"
    indicator_text = ', '.join(code['indicators'])
    return f"{headline}\n   Indicators: {indicator_text}"


# Numbered, newline-separated description of every code in the codebook
codes_list = "\n".join(
    _format_code(number, code)
    for number, code in enumerate(codebook.get("codebook", []), start=1)
)

In [None]:
# Inspect the formatted list of codes built from the codebook
codes_list

In [None]:
# Code every post: work on an independent copy so `mh` itself is left untouched
coding_sample = mh.copy(deep=True)

In [None]:
# Announce the workload and start with an empty list of per-post results
n_posts_to_code = coding_sample.shape[0]
print(f"Applying codes to {n_posts_to_code} posts...")
coding_results = list()

In [None]:
# Start at the first post without a result, so re-running this cell after an
# interruption resumes where it stopped instead of starting over.
for idx in tqdm(range(len(coding_results), len(coding_sample))):
    row = coding_sample.iloc[idx]

    title = row['title']
    text = row['clean_text']
    deduction_prompt = deduction_prompt_template.format(
        codes_list=codes_list,
        post_title=title,
        post_text=text
    )

    # Default empty result structure
    parsed_result = {
        "assigned_codes": [],
        "post_title": title,
        "post_id": idx,
        # Use the explicit identifier column: it is the key the results are merged on
        "uid": row['uid']
    }

    # Query the LLM: one attempt plus one retry after a short pause. A second
    # failure is re-raised so a persistent problem (bad key, exhausted quota)
    # stops the run; results so far are kept in `coding_results`.
    content = None
    for attempt in range(2):
        # Reset per attempt so the error handler never prints values left over
        # from a previous post (or fails on names that were never assigned)
        response = result = None
        try:
            response = query_llm(prompt=deduction_prompt)
            result = response.json()
            content = result["choices"][0]["message"]["content"]
            break
        except Exception as e:
            print(f"Error processing post {idx}: {e}")
            print(response)
            print(result)
            if attempt == 1:
                raise
            time.sleep(5)

    try:
        # Unwrap a ```json fence, a plain ``` fence, or take the reply as bare JSON
        if "```json" in content:
            json_text = content.split("```json", 1)[1].split("```", 1)[0]
        elif "```" in content:
            json_text = content.split("```", 1)[1].split("```", 1)[0]
        else:
            json_text = content

        parsed_json = json.loads(json_text.strip())

        # Keep only dict entries: later cells call .get() on each assigned code
        parsed_result["assigned_codes"] = [
            code_info for code_info in parsed_json["assigned_codes"]
            if isinstance(code_info, dict)
        ]
    except Exception as e:
        # An unparseable reply is recorded as "no codes" rather than aborting the run
        print(f"Error processing post {idx}: {e}")

    # Add result to collection
    coding_results.append(parsed_result)

    # Rate limiting
    time.sleep(0.75)
    

In [None]:
# Flatten the per-post results into one row per (post, assigned code)
code_columns = ['uid', 'code', 'confidence', 'reasoning', 'evidence']
codes_data = []

for post_result in coding_results:
    # Extract code information for this post
    for code_info in post_result['assigned_codes']:
        if not isinstance(code_info, dict):
            continue  # malformed entry from the LLM; nothing to extract
        codes_data.append({
            'uid': post_result.get('uid'),  # links the code back to its post
            'code': code_info.get('code'),
            'confidence': code_info.get('confidence'),
            'reasoning': code_info.get('reasoning'),
            'evidence': code_info.get('evidence')
        })

# Passing the columns explicitly keeps the 'uid' column present even when no
# codes were assigned, so a later merge on 'uid' does not fail on an empty frame
codes_df = pd.DataFrame(codes_data, columns=code_columns)

In [None]:
# Attach the assigned codes to their posts. This is an inner join, so posts
# that received no codes do not appear in `combined`.
combined = pd.merge(mh, codes_df, how='inner', on='uid')

In [None]:
# Preview the first rows of the merged posts-and-codes table
combined.head()

In [None]:
# Write the coded posts to disk; the file name records which model produced the codes
coded_sample_path = f'llm_coded_sample_{model}.csv'
combined.to_csv(coded_sample_path, index=False)