# In [None]:  (notebook cell marker left over from the export)
import pandas as pd
import json
from collections import defaultdict
import numpy as np
from plotnine import * # Import plotnine
import warnings

# --- Configuration ---
# Path of the JSON file with coding results that the analysis below opens and parses.
INPUT_CODING_RESULTS_FILE = "deductive_coding_results_multi_model.json"
# Optional confidence threshold for treating a prediction as valid.
# Currently disabled: every identified topic/subtopic is counted regardless of confidence.
# CONFIDENCE_THRESHOLD = 3

# --- Helper Functions ---

def calculate_metrics(tp, fp, fn):
    """Compute precision, recall and F1-score from raw counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any ratio whose
        denominator is not positive is reported as ``0.0`` instead of
        dividing by zero.
    """
    predicted = tp + fp   # everything the coder reported
    expected = tp + fn    # everything present in the ground truth

    # Use float fallbacks so every element of the result has the same type,
    # whichever branch produced it.
    precision = tp / predicted if predicted > 0 else 0.0
    recall = tp / expected if expected > 0 else 0.0

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    return precision, recall, f1

def safe_get(data, keys, default=None):
    """Walk ``keys`` through nested dictionaries and return the value found.

    ``default`` is returned when ``data`` is not a dict, or when any step of
    the path hits a non-dict or a missing key. A value that is present is
    returned as-is, even if it is ``None``.
    """
    if not isinstance(data, dict):
        return default

    node = data
    for step in keys:
        # Guard clause: stop as soon as the path cannot be followed further.
        if not isinstance(node, dict) or step not in node:
            return default
        node = node[step]
    return node

# --- Load results ---
results_file = INPUT_CODING_RESULTS_FILE

with open(results_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)
print("Converted results to DataFrame.")

# --- Data Cleaning and Preparation ---

# Keep only rows whose coding_result is a dict that carries no 'error' key.
initial_count = len(df)
valid_mask = df['coding_result'].apply(
    lambda x: isinstance(x, dict) and 'error' not in x
).astype(bool)  # force a boolean mask even when the frame has no rows
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[valid_mask].copy()
print(f"Filtered out {initial_count - len(df)} entries with coding errors.")

if df.empty:
    # The remaining steps are written to tolerate an empty frame; they simply
    # produce empty summaries.
    print("No valid coding results found after filtering errors.")


# Extract ground truth information safely
df['gt_topics'] = df['ground_truth'].apply(
    lambda x: set(k for k, v in safe_get(x, ['topic_mix'], {}).items() if v is not None)
)
df['gt_subtopics'] = df['ground_truth'].apply(
    lambda x: set(v for k, v in safe_get(x, ['selected_subtopics'], {}).items() if v is not None)
)

# Extract predicted topic ids and subtopic names safely
df['pred_topics'] = df['coding_result'].apply(
    lambda x: set(t['topic_id'] for t in safe_get(x, ['topics'], []) if isinstance(t, dict) and 'topic_id' in t)
)
df['pred_subtopics'] = df['coding_result'].apply(
    lambda x: set(s['subtopic_name'] for s in safe_get(x, ['subtopics'], []) if isinstance(s, dict) and 'subtopic_name' in s)
)

# Extract grouping parameters from ground_truth safely
for column in ('domain', 'diversity_type', 'allow_topic_mention', 'allow_subtopic_mention'):
    df[column] = df['ground_truth'].apply(lambda x, column=column: safe_get(x, [column], 'N/A'))

# Extract individual diversity parameters (handle missing/None)
diversity_param_keys = [
    'concept_blending', 'concept_granularity', 'interdisciplinary_orientation',
    'methodological_approaches', 'rhetorical_structures', 'temporal_context',
    'terminology_density'
]

for key in diversity_param_keys:
    df[key] = df['ground_truth'].apply(lambda x, key=key: safe_get(x, ['diversity_params', key], 'N/A'))
    df[key] = df[key].fillna('N/A')  # Replace None/NaN with 'N/A' for grouping

print("Extracted ground truth, predictions, and grouping parameters.")

# --- Calculate TP, FP, FN per row ---
print("Calculating TP, FP, FN for each entry...")

# Plain set arithmetic over zipped columns instead of df.apply(axis=1): it
# yields the same counts and also works when the frame has no rows.
topic_pairs = list(zip(df['gt_topics'], df['pred_topics']))
df['tp_topics'] = [len(gt & pred) for gt, pred in topic_pairs]
df['fp_topics'] = [len(pred - gt) for gt, pred in topic_pairs]
df['fn_topics'] = [len(gt - pred) for gt, pred in topic_pairs]

subtopic_pairs = list(zip(df['gt_subtopics'], df['pred_subtopics']))
df['tp_subtopics'] = [len(gt & pred) for gt, pred in subtopic_pairs]
df['fp_subtopics'] = [len(pred - gt) for gt, pred in subtopic_pairs]
df['fn_subtopics'] = [len(gt - pred) for gt, pred in subtopic_pairs]

print("Calculations complete.")

# --- Grouping and Aggregation ---
grouping_vars = [
    'domain',
    'diversity_type',
    'generating_model',
    'coding_model',
    'allow_topic_mention',
    'allow_subtopic_mention'
] + diversity_param_keys

# Maps each grouping variable to its summary DataFrame (one row per group).
results_summary = {}

for group_var in grouping_vars:
    print(f"\n--- Analyzing by: {group_var} ---")

    if group_var not in df.columns:
        print(f"Warning: Grouping variable '{group_var}' not found in DataFrame. Skipping.")
        continue

    # Sum the per-entry counts within each group; metrics are then computed
    # from the pooled counts rather than averaged per entry.
    grouped = df.groupby(group_var).agg(
        total_entries=('original_variation_id', 'count'),  # Count entries per group
        sum_tp_topics=('tp_topics', 'sum'),
        sum_fp_topics=('fp_topics', 'sum'),
        sum_fn_topics=('fn_topics', 'sum'),
        sum_tp_subtopics=('tp_subtopics', 'sum'),
        sum_fp_subtopics=('fp_subtopics', 'sum'),
        sum_fn_subtopics=('fn_subtopics', 'sum')
    )

    # Calculate metrics for each group
    results_list = []
    for group_name, row in grouped.iterrows():
        # Topic Metrics
        prec_t, recall_t, f1_t = calculate_metrics(row['sum_tp_topics'], row['sum_fp_topics'], row['sum_fn_topics'])
        # Subtopic Metrics
        prec_s, recall_s, f1_s = calculate_metrics(row['sum_tp_subtopics'], row['sum_fp_subtopics'], row['sum_fn_subtopics'])

        results_list.append({
            group_var: group_name,
            'Total Entries': row['total_entries'],
            'TP (Topics)': row['sum_tp_topics'],
            'FP (Topics)': row['sum_fp_topics'],
            'FN (Topics)': row['sum_fn_topics'],
            'Precision (Topics)': prec_t,
            'Recall (Topics)': recall_t,
            'F1 (Topics)': f1_t,
            'TP (Subtopics)': row['sum_tp_subtopics'],
            'FP (Subtopics)': row['sum_fp_subtopics'],
            'FN (Subtopics)': row['sum_fn_subtopics'],
            'Precision (Subtopics)': prec_s,
            'Recall (Subtopics)': recall_s,
            'F1 (Subtopics)': f1_s,
        })

    # Convert results for this grouping to DataFrame for better display
    results_df = pd.DataFrame(results_list)
    results_summary[group_var] = results_df

    # Print the results for the current grouping
    print(results_df.round(3).to_string(index=False))  # Display rounded results
    



def plot_evaluation_metrics(results_summary):
    """Draw grouped bar charts of Precision, Recall and F1 with plotnine.

    For every grouping variable, one chart is produced for topic metrics and
    one for subtopic metrics. The metric is on the x-axis and the bars are
    coloured by the values of the grouping variable.

    Args:
        results_summary (dict): Maps a grouping-variable name to a summary
            DataFrame that has a column of that name plus the columns
            ``'<Metric> (Topics)'`` and ``'<Metric> (Subtopics)'`` for
            Precision, Recall and F1. The DataFrames are not modified.

    Returns:
        None. Progress, warnings and errors are printed; plots are displayed
        by printing the plot object.
    """
    print("\n--- Generating Evaluation Plots ---")

    if not results_summary:
        print("No summary data to plot.")
        return

    metric_order = ['Precision', 'Recall', 'F1']

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)  # Suppress plotnine warnings if needed

        for group_var, summary_df in results_summary.items():
            print(f"\nPlotting metrics for grouping: {group_var}")

            if summary_df.empty:
                print(f"Skipping empty or invalid summary for {group_var}")
                continue

            if group_var not in summary_df.columns:
                print(f"Warning: Column '{group_var}' not found in summary DataFrame for plotting. Skipping.")
                continue

            # Work on a copy so the caller's summary keeps its original dtypes.
            plot_df = summary_df.copy()
            try:
                # String group labels make plotnine treat the fill as discrete.
                plot_df[group_var] = plot_df[group_var].astype(str)
            except Exception as e:
                print(f"Warning: Could not convert group var '{group_var}' to string: {e}. Plotting might fail.")

            group_label = group_var.replace('_', ' ').title()
            fig_width = max(6, len(plot_df[group_var].unique()) * 0.8)  # Dynamic width

            # Topic and subtopic charts share all logic; only the column
            # suffix and the wording of titles/messages differ.
            for level in ('Topic', 'Subtopic'):
                level_lower = level.lower()
                rename_map = {f'{metric} ({level}s)': metric for metric in metric_order}

                if not all(col in plot_df.columns for col in rename_map):
                    print(f"Warning: Missing {level_lower} metric columns for {group_var}. Skipping {level_lower} plot.")
                    continue

                # --- Reshape to long form: one row per (group, metric) ---
                try:
                    metrics_df = plot_df[[group_var] + list(rename_map)].rename(columns=rename_map)
                    melted = pd.melt(metrics_df,
                                     id_vars=[group_var],
                                     var_name='Metric Type',
                                     value_name='Score')
                    melted['Metric Type'] = pd.Categorical(melted['Metric Type'], categories=metric_order)  # Ensure order
                except Exception as melt_e:
                    print(f"ERROR during {level_lower} data melting for {group_var}: {melt_e}")
                    continue  # Skip only this level; the other level is still plotted

                # --- Build and display the plot (best effort) ---
                try:
                    plot = (
                        ggplot(melted, aes(x='Metric Type', y='Score', fill=group_var))
                        + geom_col(position=position_dodge(width=0.9), na_rm=True)  # Dodge bars, remove NA scores
                        + labs(title=f'{level} Identification Metrics by {group_label}',
                               x='Metric',          # x-axis shows Precision/Recall/F1
                               y='Score (0-1)',
                               fill=group_label)    # legend lists the group values
                        + ylim(0, 1.05)  # Extend slightly beyond 1 for visibility
                        + theme_minimal(base_size=10)
                        + theme(axis_text_x=element_text(angle=45, hjust=1, size=8),
                                plot_title=element_text(size=12),
                                figure_size=(fig_width, 4))
                        + theme(plot_background=element_rect(fill='white'), panel_background=element_rect(fill='white'))
                    )
                    print(plot)  # Display the plot
                except Exception as e:
                    print(f"ERROR generating {level_lower} plot for {group_var}: {e}")

    print("\n--- Plotting Complete ---")


# Plot the per-grouping summaries collected in results_summary by the loop above.
plot_evaluation_metrics(results_summary)

import pandas as pd
import json
from collections import defaultdict
import numpy as np
from plotnine import * # Import plotnine
import warnings
import re # Import regex for parsing keys later

# --- Configuration ---
# Path of the JSON coding-results file passed to analyze_coding_results() at the end of this cell.
INPUT_CODING_RESULTS_FILE = "deductive_coding_results_multi_model.json"
# Optional confidence threshold for treating a prediction as valid (currently disabled).
# CONFIDENCE_THRESHOLD = 3

# --- Helper Functions ---

def calculate_metrics(tp, fp, fn):
    """Calculate Precision, Recall, and F1-Score from raw counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats. A score is ``0.0``
        whenever its denominator is not strictly positive, so no division by
        zero can occur.
    """
    # Float fallbacks keep the return type uniform across all branches.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    # No separate NaN check is needed: the guard below already yields 0.0
    # when precision + recall is zero or NaN (a NaN comparison is False).
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

def safe_get(data, keys, default=None):
    """Look up a nested value by following ``keys`` through dictionaries.

    Returns ``default`` if ``data`` is not a dict or if the path breaks at any
    point (a missing key, or a non-dict where a dict is needed). A stored
    value is returned unchanged, including ``None``.
    """
    if not isinstance(data, dict):
        return default

    missing = object()  # private marker, distinguishable from any stored value
    current = data
    for name in keys:
        current = current.get(name, missing) if isinstance(current, dict) else missing
        if current is missing:
            return default
    return current

def analyze_coding_results(results_file):
    """Load coding results and summarise topic/subtopic identification quality.

    The function reads a JSON file into a DataFrame, drops entries that carry
    an error, derives ground-truth and predicted topic/subtopic sets, counts
    TP/FP/FN per entry, and aggregates those counts into precision, recall
    and F1 for several groupings: each variable on its own, and every other
    variable nested under ``coding_model`` and under ``generating_model``.

    Relies on the module-level helpers ``safe_get`` and ``calculate_metrics``.

    Args:
        results_file: Path of the JSON results file.

    Returns:
        dict mapping a grouping key to its summary DataFrame. Keys are the
        variable name for single groupings and
        ``"by_<primary>_then_<secondary>"`` for nested ones. Returns ``None``
        when the file cannot be loaded, when the ``coding_result`` or
        ``ground_truth`` column is missing, or when no valid entries remain.
    """
    print(f"Loading coding results from: {results_file}")
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"Loaded {len(data)} entries.")
    except FileNotFoundError:
        print(f"ERROR: File not found at {results_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to decode JSON from {results_file}: {e}")
        return None
    except Exception as e:
        # Boundary handler: report any other load problem and signal failure.
        print(f"ERROR: An unexpected error occurred loading the file: {e}")
        return None

    # Convert to DataFrame
    df = pd.DataFrame(data)
    print("Converted results to DataFrame.")

    # An empty or malformed result list has none of the expected columns;
    # report that the same way as the other load failures.
    missing_columns = [c for c in ('coding_result', 'ground_truth') if c not in df.columns]
    if missing_columns:
        print(f"ERROR: Results are missing required column(s): {missing_columns}")
        return None

    # --- Data Cleaning and Preparation ---
    initial_count = len(df)
    if 'error' in df.columns:
        df = df[df['error'].isna()]
    is_valid = df['coding_result'].apply(
        lambda x: isinstance(x, dict) and 'error' not in x
    ).astype(bool)  # force a boolean mask even when no rows are left
    # .copy() so the column assignments below act on an independent frame.
    df = df[is_valid].copy()
    print(f"Filtered out {initial_count - len(df)} entries with coding errors or invalid format.")

    if df.empty:
        print("No valid coding results found after filtering errors.")
        return None

    def _dict_at(container, key):
        """Return container[key] if it is a dict, otherwise an empty dict."""
        value = safe_get(container, [key])
        return value if isinstance(value, dict) else {}

    def _predicted_set(result, list_key, item_key):
        """Collect item[item_key] from the dict items of result[list_key]."""
        items = safe_get(result, [list_key])
        if not isinstance(items, list):
            return set()
        return {item[item_key] for item in items if isinstance(item, dict) and item_key in item}

    # Ground truth: topic ids are the keys of topic_mix, subtopics are the
    # values of selected_subtopics; entries whose value is None are ignored.
    df['gt_topics'] = df['ground_truth'].apply(
        lambda x: {k for k, v in _dict_at(x, 'topic_mix').items() if v is not None}
    )
    df['gt_subtopics'] = df['ground_truth'].apply(
        lambda x: {v for v in _dict_at(x, 'selected_subtopics').values() if v is not None}
    )

    # Predictions
    df['pred_topics'] = df['coding_result'].apply(lambda x: _predicted_set(x, 'topics', 'topic_id'))
    df['pred_subtopics'] = df['coding_result'].apply(lambda x: _predicted_set(x, 'subtopics', 'subtopic_name'))

    # Extract grouping parameters safely
    df['domain'] = df['ground_truth'].apply(lambda x: safe_get(x, ['domain'], 'N/A')).fillna('N/A')
    df['diversity_type'] = df['ground_truth'].apply(lambda x: safe_get(x, ['diversity_type'], 'N/A')).fillna('N/A')
    # Converted to str so boolean and 'N/A' values group together cleanly.
    df['allow_topic_mention'] = df['ground_truth'].apply(lambda x: safe_get(x, ['allow_topic_mention'], 'N/A')).fillna('N/A').astype(str)
    df['allow_subtopic_mention'] = df['ground_truth'].apply(lambda x: safe_get(x, ['allow_subtopic_mention'], 'N/A')).fillna('N/A').astype(str)

    df['generating_model'] = df['generating_model'].fillna('N/A') if 'generating_model' in df.columns else 'N/A'
    df['coding_model'] = df['coding_model'].fillna('N/A') if 'coding_model' in df.columns else 'N/A'

    diversity_param_keys = [
        'concept_blending', 'concept_granularity', 'interdisciplinary_orientation',
        'methodological_approaches', 'rhetorical_structures', 'temporal_context',
        'terminology_density'
    ]
    for key in diversity_param_keys:
        df[key] = df['ground_truth'].apply(lambda x, key=key: safe_get(x, ['diversity_params', key], 'N/A'))
        df[key] = df[key].fillna('N/A').astype(str)

    print("Extracted ground truth, predictions, and grouping parameters.")

    # --- Calculate TP, FP, FN per row ---
    for level in ('topics', 'subtopics'):
        pairs = list(zip(df[f'gt_{level}'], df[f'pred_{level}']))
        df[f'tp_{level}'] = [len(gt & pred) for gt, pred in pairs]
        df[f'fp_{level}'] = [len(pred - gt) for gt, pred in pairs]
        df[f'fn_{level}'] = [len(gt - pred) for gt, pred in pairs]
    print("Calculated TP, FP, FN.")

    # --- Aggregation Function ---
    def aggregate_and_calculate(grouped_df):
        """Sum TP/FP/FN per group and append the metrics computed from the sums."""
        agg_results = grouped_df.agg(
            total_entries=('original_variation_id', 'count'),
            sum_tp_topics=('tp_topics', 'sum'),
            sum_fp_topics=('fp_topics', 'sum'),
            sum_fn_topics=('fn_topics', 'sum'),
            sum_tp_subtopics=('tp_subtopics', 'sum'),
            sum_fp_subtopics=('fp_subtopics', 'sum'),
            sum_fn_subtopics=('fn_subtopics', 'sum')
        ).reset_index()

        frames = [agg_results]
        for level, label in (('topics', 'Topics'), ('subtopics', 'Subtopics')):
            # One calculate_metrics call per group yields all three scores.
            scores = [
                calculate_metrics(tp, fp, fn)
                for tp, fp, fn in zip(agg_results[f'sum_tp_{level}'],
                                      agg_results[f'sum_fp_{level}'],
                                      agg_results[f'sum_fn_{level}'])
            ]
            frames.append(pd.DataFrame(
                scores,
                columns=[f'Precision ({label})', f'Recall ({label})', f'F1 ({label})'],
                index=agg_results.index,
            ))
        # Combine aggregated counts and calculated metrics
        return pd.concat(frames, axis=1)

    # --- Perform Grouping and Aggregation ---
    results_summary = {}
    base_grouping_vars = [
        'domain', 'diversity_type', 'generating_model', 'coding_model',
        'allow_topic_mention', 'allow_subtopic_mention'
    ]
    valid_diversity_keys = [key for key in diversity_param_keys if key in df.columns]
    all_grouping_vars = base_grouping_vars + valid_diversity_keys

    def _summarise(result_key, columns):
        """Group by ``columns`` and store the summary under ``result_key``."""
        nested = len(columns) > 1
        print(f"  Grouping by: {', '.join(columns)}")
        if any(col not in df.columns or df[col].isnull().all() for col in columns):
            noun = 'columns' if nested else 'column'
            print(f"    Skipping '{result_key}' due to missing {noun} or all null values.")
            return
        try:
            by = columns if nested else columns[0]
            grouped = df.groupby(by, observed=False, dropna=False)  # Include NA group if present
            results_summary[result_key] = aggregate_and_calculate(grouped)
            print(f"    Aggregated results for {result_key}.")
        except Exception as e:
            # Best effort: one failing grouping must not abort the others.
            kind = 'nested' if nested else 'single'
            print(f"    ERROR during {kind} grouping for '{result_key}': {e}")

    # 1. Single Variable Grouping
    print("\n--- Performing Single Variable Grouping Analysis ---")
    for group_var in all_grouping_vars:
        _summarise(group_var, [group_var])

    # 2./3. Nested Grouping: by coding model, then by generating model
    for primary_var, primary_label in (('coding_model', 'Coding Model'),
                                       ('generating_model', 'Generating Model')):
        print(f"\n--- Performing Nested Grouping Analysis (by {primary_label} first) ---")
        for secondary_var in all_grouping_vars:
            if secondary_var != primary_var:
                _summarise(f"by_{primary_var}_then_{secondary_var}", [primary_var, secondary_var])

    print("\n--- Analysis Complete ---")
    return results_summary

def plot_evaluation_metrics(results_summary):
    """Draw Precision/Recall/F1 bar charts for every summary with plotnine.

    Keys of the form ``by_coding_model_then_<var>`` or
    ``by_generating_model_then_<var>`` are treated as nested groupings: bars
    are coloured by ``<var>`` and the chart is faceted by the model column.
    Any other key is taken to be the grouping column itself. For each key a
    topic chart and a subtopic chart are produced.

    Args:
        results_summary (dict | None): Maps grouping keys to summary
            DataFrames containing the grouping column(s) and the columns
            ``'<Metric> (Topics)'`` / ``'<Metric> (Subtopics)'`` for
            Precision, Recall and F1. The DataFrames are not modified.

    Returns:
        None. Progress, warnings and errors are printed; plots are displayed
        by printing the plot object.
    """
    print("\n--- Generating Evaluation Plots ---")

    if not results_summary:
        print("No summary data to plot.")
        return

    metric_order = ['Precision', 'Recall', 'F1']
    # key prefix -> (facet column, label used in the plot title)
    nested_prefixes = {
        'by_coding_model_then_': ('coding_model', 'Coding Model'),
        'by_generating_model_then_': ('generating_model', 'Generating Model'),
    }

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)  # Suppress plotnine warnings

        for key, summary_df in results_summary.items():
            print(f"\nPlotting metrics for grouping key: {key}")

            if summary_df.empty:
                print(f"  Skipping empty summary for {key}")
                continue

            # Identify grouping type and variables from the key
            primary_group_var = key      # single grouping: the key is the column
            secondary_group_var = None
            facet_label = None
            for prefix, (facet_var, label) in nested_prefixes.items():
                if key.startswith(prefix):
                    primary_group_var = facet_var
                    secondary_group_var = key[len(prefix):]
                    facet_label = label
                    break

            # Bars are coloured by the secondary variable when nested,
            # otherwise by the single grouping variable.
            fill_var = secondary_group_var if secondary_group_var else primary_group_var
            fill_label = fill_var.replace('_', ' ').title()
            plot_title_suffix = f" by {fill_label}"
            if secondary_group_var:
                plot_title_suffix += f"\n(Faceted by {facet_label})"

            # Ensure necessary columns exist
            required_plot_cols = [primary_group_var]
            if secondary_group_var:
                required_plot_cols.append(secondary_group_var)

            if not all(col in summary_df.columns for col in required_plot_cols):
                print(f"  Warning: Missing grouping columns ({required_plot_cols}) in summary DF for key '{key}'. Skipping plot.")
                continue

            # Convert grouping vars to string on a copy, so the caller's
            # summary keeps its original dtypes.
            plot_df = summary_df.copy()
            try:
                for col in required_plot_cols:
                    plot_df[col] = plot_df[col].astype(str)
            except Exception as e:
                print(f"  Warning: Could not convert grouping columns to string for key '{key}': {e}. Plotting might fail.")
                continue

            fig_width = max(6, len(plot_df[fill_var].unique()) * (0.6 if secondary_group_var else 0.8))
            # One 4-inch-high panel per facet when nested.
            fig_height = 4 * (len(plot_df[primary_group_var].unique()) if secondary_group_var else 1)

            # Topic and subtopic charts share all logic; each level melts and
            # plots its OWN metric columns.
            for level in ('Topic', 'Subtopic'):
                level_lower = level.lower()
                rename_map = {f'{metric} ({level}s)': metric for metric in metric_order}

                if not all(col in plot_df.columns for col in rename_map):
                    print(f"  Warning: Missing {level_lower} metric columns for {key}. Skipping {level_lower} plot.")
                    continue

                # --- Reshape to long form: one row per (group, metric) ---
                try:
                    metrics_df = plot_df[required_plot_cols + list(rename_map)].rename(columns=rename_map)
                    melted = pd.melt(metrics_df, id_vars=required_plot_cols, var_name='Metric Type', value_name='Score')
                    melted['Metric Type'] = pd.Categorical(melted['Metric Type'], categories=metric_order)
                except Exception as melt_e:
                    print(f"  ERROR during {level_lower} data melting for {key}: {melt_e}")
                    continue  # Skip only this level; the other level is still plotted

                # --- Build and display the plot (best effort) ---
                try:
                    p = (
                        ggplot(melted, aes(x='Metric Type', y='Score', fill=fill_var))
                        + geom_col(position=position_dodge(width=0.9), na_rm=True)
                        + labs(title=f'{level} Identification Metrics{plot_title_suffix}',
                               x='Metric',          # x-axis shows Precision/Recall/F1
                               y='Score (0-1)',
                               fill=fill_label)     # legend lists the group values
                        + ylim(0, 1.05)
                        + theme_minimal(base_size=9)
                        + theme(axis_text_x=element_text(angle=45, hjust=1, size=7),
                                plot_title=element_text(size=11),
                                strip_text=element_text(size=8),  # Facet title size
                                figure_size=(fig_width, fig_height))
                        + theme(plot_background=element_rect(fill='white'), panel_background=element_rect(fill='white'))
                    )

                    # Add facet if it's a nested grouping
                    if secondary_group_var:
                        p += facet_wrap(f'~{primary_group_var}', ncol=1)  # Stack facets vertically

                    print(p)  # Display the plot
                except Exception as e:
                    print(f"  ERROR generating {level_lower} plot for {key}: {e}")

    print("\n--- Plotting Complete ---")

# --- Run Analysis and Plotting ---
# analyze_coding_results() returns None when loading fails or no valid entries
# remain; plot_evaluation_metrics() treats a falsy summary as "nothing to plot".
analysis_summary = analyze_coding_results(INPUT_CODING_RESULTS_FILE)

plot_evaluation_metrics(analysis_summary)

