In [None]:
import pandas as pd
import numpy as np
import json
import requests
import time
import os
import re
from tqdm.auto import tqdm
import random
import tenacity
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from typing import Dict, List, Any, Tuple, Optional
import ast # Keep for potential fallback parsing if needed

# --- Configuration for Deductive Coding ---

# Models used for the CODING task.
# The script codes every abstract once with each model listed here. All
# entries currently share the same provider and sampling settings, so the
# table is generated from the list of model names.
CODING_MODELS = {
    model_id: {
        "provider": "groq",
        "model_name": model_id,
        "max_tokens": 8000,  # Max tokens for the coding response
        "temperature": 0.1,  # Low temperature for consistent coding
    }
    for model_id in (
        "llama-3.3-70b-versatile",
        "llama-3.1-8b-instant",
        # Add other models intended for coding here
    )
}

# API configuration (using keys for the CODING models)
# Ensure the key here has access to the models defined in CODING_MODELS.
#
# SECURITY: credentials are read from the environment only. A literal API key
# used to be embedded here as a fallback; any key that was ever committed to
# source must be treated as compromised and revoked with the provider.
# GROQ_API_KEY_CODING takes precedence (dedicated key for this script);
# GROQ_API_KEY is accepted as a general-purpose fallback. When neither is set
# the value is None and query_model() raises a ValueError before any request.
API_CONFIG = {
    "groq": {
        "api_key": os.environ.get("GROQ_API_KEY_CODING") or os.environ.get("GROQ_API_KEY"),
        "api_url": "https://api.groq.com/openai/v1/chat/completions"
    },
    # Add configs for other providers if used in CODING_MODELS
}

# Input file from the synthetic generation script
INPUT_ABSTRACTS_FILE = "synthetic_abstracts_multi_model.feather"

# Output file for coding results
OUTPUT_CODING_RESULTS_FILE = "deductive_coding_results_multi_model.json"

# Max samples to process (set to None to process all)
MAX_SAMPLES_TO_CODE = None # e.g., 100 for testing, None for all

# --- Definitions from Synthetic Data Generation (for Codebook) ---
# Ensure these match the definitions used during generation
# Topic table: topic id -> display name, candidate subtopics, and the
# description shown to the coding model.
TOPICS = {
    "T1": {
        "name": "Machine Learning",
        "subtopics": [
            "Neural Networks",
            "Reinforcement Learning",
            "Supervised Learning",
            "Unsupervised Learning",
            "Transfer Learning",
        ],
        "description": "Machine Learning involves developing algorithms and models that enable computers to learn from and make predictions or decisions based on data without being explicitly programmed.",
    },
    "T7": {
        "name": "Sustainable Development",
        "subtopics": [
            "Renewable Energy",
            "Climate Change Mitigation",
            "Resource Management",
            "Environmental Monitoring",
            "Sustainable Cities",
        ],
        "description": "Sustainable Development focuses on meeting present needs without compromising future generations, balancing economic growth, environmental protection, and social equity.",
    },
    "T8": {
        "name": "Behavioral Economics",
        "subtopics": [
            "Decision Making",
            "Cognitive Biases",
            "Risk Assessment",
            "Social Preferences",
            "Intertemporal Choice",
        ],
        "description": "Behavioral Economics studies how psychological, social, cognitive, and emotional factors influence economic decisions, challenging the assumption of perfect rationality.",
    },
    "T9": {
        "name": "Digital Security",
        "subtopics": [
            "Cybersecurity",
            "Privacy Enhancing Technologies",
            "Authentication Methods",
            "Threat Detection",
            "Security Policy",
        ],
        "description": "Digital Security encompasses technologies, protocols, and practices designed to protect computers, networks, programs, and data from attacks, damage, or unauthorized access.",
    },
    "T10": {
        "name": "Public Health",
        "subtopics": [
            "Epidemiology",
            "Health Promotion",
            "Disease Prevention",
            "Health Equity",
            "Health Systems",
        ],
        "description": "Public Health focuses on protecting and improving health at the population level through organized efforts, education, policies, and research.",
    },
}
# Note: DOMAINS list is not directly used in the codebook creation here,
# but the domain value is extracted from diversity_params later.

# --- Helper Functions ---

def create_codebook():
    """Build the deductive-coding codebook from the module-level TOPICS table.

    Returns:
        A dict with two keys:
        - "topics": one entry per topic id, carrying its name, id, subtopics
          and description.
        - "disambiguation_guidelines": free text that is inserted verbatim
          into the coding prompt.
    """
    topic_entries = {}
    for topic_id, spec in TOPICS.items():
        topic_entries[topic_id] = {
            "name": spec["name"],
            "id": topic_id,
            "subtopics": spec["subtopics"],
            "description": spec["description"],
        }

    # This text is sent to the model as-is, so it is reproduced exactly.
    guidelines = """
        When coding abstracts, carefully distinguish between topics and between topics and subtopics:

    TOPICS refer to the academic fields, subjects, or methodologies that form the theoretical or methodological foundation of the research. They answer "what knowledge area is being studied or applied?"

    SUBTOPICS refer a finer-grained version of the relevant TOPIC in the abstract

    For example, an abstract might describe using Machine Learning (TOPIC) to performed supervised learning (SUBTOPIC).
    Here, Machine Learning is the broad academic subject being discussed and supervised learning is the more fine-graied topic being discussed.

    Evidence for topics typically includes:
    - Specific methodologies, theories, or frameworks from that academic field
    - Technical terminology associated with the discipline
    - Citations or references to literature in that field


    Evidence for subtopics typically includes:
    - Evidence of less granular topic
    - Specific methodologies, theories, or frameworks from that academic field
    - Technical terminology associated with the discipline
    - Citations or references to literature in that field

    Be aware that sometimes terminology can overlap.
    """

    return {"topics": topic_entries, "disambiguation_guidelines": guidelines}

def create_deductive_coding_prompt(codebook, abstract_text):
    """Render the deductive-coding prompt for one abstract.

    Args:
        codebook: Dict with a "topics" mapping (topic id -> dict holding
            "name", "description" and "subtopics") and a
            "disambiguation_guidelines" string.
        abstract_text: The abstract to be coded; inserted verbatim.

    Returns:
        The full prompt string: codebook topics as indented JSON, the
        guidelines, the abstract, the task description and the required JSON
        answer format.
    """
    # Flatten the topic mapping into a list of records for the JSON dump.
    topic_records = []
    for topic_id, details in codebook["topics"].items():
        topic_records.append({
            "id": topic_id,
            "name": details["name"],
            "description": details["description"],
            "subtopics": details["subtopics"],
        })
    topics_block = json.dumps(topic_records, indent=2)
    guidelines = codebook["disambiguation_guidelines"]

    # Literal braces of the example JSON are doubled (`{{` / `}}`) because the
    # template is an f-string.
    return f"""
You are a highly skilled research methodologist performing deductive coding on academic abstracts. Your task is to analyze the following abstract and systematically identify both TOPICS and SUBTOPICS based on a predefined codebook.

# CODEBOOK

## Topics
```json
{topics_block}
```

## DISAMBIGUATION GUIDELINES
{guidelines}

# CODING TASK

Analyze the following abstract:

---
{abstract_text}
---

Perform the following analysis:

1.  **TOPIC IDENTIFICATION**:
    * Identify ALL topics (from the codebook) that are present in the abstract.
    * For EACH identified topic, estimate the proportion (percentage) of the abstract text devoted to it. Proportions must sum to 100%. If a topic is present but minor, assign a small percentage (e.g., 5%). If no topics are identifiable, return an empty list for "topics".
    * Rate your confidence for EACH topic identification on a scale of 1 (low) to 5 (high).

2.  **SUBTOPIC IDENTIFICATION**:
    * Identify ONE subtopic per identified topic (from the codebook topics' subtopic lists) that are represented in the abstract. List the subtopic name and the ID of the parent topic it belongs to.
    * Rate your confidence for EACH subtopic identification on a scale of 1 (low) to 5 (high). If no subtopics are identifiable, return an empty list for "subtopics".

3.  **DISAMBIGUATION EXPLANATION**:
    * Briefly explain your reasoning for distinguishing between the identified topics and subtopics, especially in potentially ambiguous cases, referencing the disambiguation guidelines. If only one topic/subtopic is found, state that. If none are found, state that.

Return your analysis **ONLY** in the following valid JSON format. Do **NOT** include any text outside this JSON structure, comments, or markdown formatting (like ```json).

```json
{{
  "topics": [
    {{
      "topic_id": "topic_identifier_from_codebook",
      "topic_name": "topic_name_from_codebook",
      "proportion": <percentage_as_number_0_to_100>,
      "confidence": <rating_from_1_to_5>
    }}
  ],
  "subtopics": [
    {{
      "parent_topic_id": "topic_identifier_from_codebook",
      "subtopic_name": "subtopic_name_from_codebook",
      "confidence": <rating_from_1_to_5>
    }}
  ],
  "disambiguation_explanation": "Your explanation here, referencing the guidelines."
}}
```

**Important Instructions:**
* Base your analysis STRICTLY on the provided abstract text. Do not infer information not present.
* Ensure the topic proportions sum to 100%.
* If you cannot identify any topics or subtopics, return empty lists `[]` for the respective keys.
* Provide a concise `disambiguation_explanation`.
* Output **only** the valid JSON object.
"""


# API request handlers with retry logic
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10),
       retry=retry_if_exception_type((requests.RequestException, json.JSONDecodeError)))
def query_groq_api(prompt, model_config, api_key, api_url):
    """POST a single-turn chat completion to Groq and return its content as a string.

    The call is retried (up to 3 attempts, exponential back-off) on
    `requests.RequestException` and `json.JSONDecodeError`.

    Args:
        prompt: User message content.
        model_config: Dict providing "model_name", "max_tokens" and "temperature".
        api_key: Bearer token placed in the Authorization header.
        api_url: Chat-completions endpoint URL.

    Returns:
        The first choice's message content. When it parses as JSON it is
        returned re-serialized via `json.dumps`; otherwise the raw content is
        returned unchanged so the caller can attempt its own extraction.

    Raises:
        requests.HTTPError: For 4XX/5XX responses (via `raise_for_status`).
        ValueError: If the response carries no "choices".
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    request_body = {
        "model": model_config["model_name"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": model_config["max_tokens"],
        "temperature": model_config["temperature"],
        "response_format": {"type": "json_object"} # Request JSON output directly
    }

    response = requests.post(api_url, headers=request_headers, json=request_body, timeout=90)
    response.raise_for_status() # Raise an exception for 4XX/5XX responses

    result = response.json()
    if "choices" not in result or len(result["choices"]) == 0:
        raise ValueError(f"Unexpected response format from Groq: {result}")

    content = result["choices"][0]["message"]["content"]

    # Already-structured content is serialized so callers always get a string.
    if isinstance(content, dict):
        return json.dumps(content)

    try:
        decoded = json.loads(content)
    except json.JSONDecodeError:
        # Hand back the raw text; the caller runs a more lenient extraction.
        print(f"Warning: Groq response content was not valid JSON despite request: {content[:100]}...")
        return content
    return json.dumps(decoded)


def query_model(prompt: str, coding_model_name: str) -> str:
    """
    Query the appropriate model API based on the coding model name.

    Args:
        prompt: The prompt string for the coding task.
        coding_model_name: The name of the model (key in CODING_MODELS) to use.

    Returns:
        str: The raw response content from the API (expected to be JSON or contain JSON).

    Raises:
        ValueError: If the provider or model configuration is invalid.
        Exception: Propagates exceptions from the underlying API call functions.
    """
    if coding_model_name not in CODING_MODELS:
        raise ValueError(f"Model '{coding_model_name}' not found in CODING_MODELS configuration.")

    model_config = CODING_MODELS[coding_model_name]
    provider = model_config["provider"]

    if provider == "groq":
        if provider not in API_CONFIG or not API_CONFIG[provider].get("api_key"):
            raise ValueError(f"API key for provider '{provider}' not found in API_CONFIG.")
        api_key = API_CONFIG[provider]["api_key"]
        # The fallback must be a plain URL: the previous default was wrapped in
        # markdown link syntax ("[url](url)"), which is not a valid URL and
        # would have failed as soon as "api_url" was absent from API_CONFIG.
        api_url = API_CONFIG[provider].get("api_url", "https://api.groq.com/openai/v1/chat/completions")
        return query_groq_api(prompt, model_config, api_key, api_url)
    # Add elif blocks for other providers here if needed
    # elif provider == "openai":
    #     # Call your OpenAI query function
    #     pass
    else:
        raise ValueError(f"Unsupported provider configured for model '{coding_model_name}': {provider}")


def extract_json_from_response(response_text: str) -> Dict[str, Any]:
    """
    Extracts JSON object from a string, handling potential surrounding text or markdown.

    The result is always a dictionary. A response that is valid JSON but not
    an object (a list, number, string or null) is not returned as-is; it goes
    through the same brace-extraction path as free text, and yields an error
    dictionary when no object can be recovered.

    Args:
        response_text: The raw string response from the LLM.

    Returns:
        A dictionary containing the parsed JSON data, or an error dictionary
        (with an "error" key and a truncated "raw_response") if parsing fails.
    """
    if not isinstance(response_text, str):
        return {"error": "Invalid input: response_text must be a string.", "raw_response": str(response_text)}

    # 1. Try direct JSON parsing first (ideal case, especially with response_format)
    try:
        direct = json.loads(response_text)
    except json.JSONDecodeError:
        direct = None
    if isinstance(direct, dict):
        return direct
    # Not parseable, or parseable but not an object: continue to extraction.

    # 2. Remove potential markdown wrappers (```json ... ``` or ``` ... ```)
    cleaned_text = re.sub(r'^```(?:json)?\s*', '', response_text.strip(), flags=re.MULTILINE)
    cleaned_text = re.sub(r'\s*```$', '', cleaned_text, flags=re.MULTILINE)
    cleaned_text = cleaned_text.strip()

    # 3. Find the outermost curly braces
    start_index = cleaned_text.find('{')
    end_index = cleaned_text.rfind('}')

    if start_index == -1 or end_index == -1 or end_index <= start_index:
        # No clear JSON structure found after cleaning
        return {"error": "No valid JSON structure found in response", "raw_response": response_text[:500]}

    json_str = cleaned_text[start_index : end_index + 1]
    try:
        # 4. Attempt to parse the extracted string (starts with '{', so a
        #    successful parse is necessarily an object)
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        # 5. Retry once with trailing commas removed, a common LLM formatting slip
        json_str_fixed = re.sub(r',\s*([}\]])', r'\1', json_str)
        try:
            return json.loads(json_str_fixed)
        except json.JSONDecodeError:
            return {"error": f"Could not parse extracted JSON: {e}", "extracted_snippet": json_str[:200], "raw_response": response_text[:500]}


# --- Main Execution Logic ---

print("Starting Deductive Coding Process...")

# Load the generated abstracts data. Any failure here is fatal: the message is
# printed and the script stops with a non-zero exit status.
try:
    abstracts_df = pd.read_feather(INPUT_ABSTRACTS_FILE)
    print(f"Successfully loaded {len(abstracts_df)} rows from {INPUT_ABSTRACTS_FILE}")
except FileNotFoundError:
    print(f"ERROR: Input file not found at {INPUT_ABSTRACTS_FILE}. Please ensure the generation script ran successfully.")
    raise SystemExit(1)
except Exception as e:
    print(f"ERROR: Failed to load or read {INPUT_ABSTRACTS_FILE}: {e}")
    raise SystemExit(1)

# Verify essential columns exist from the generation script
required_cols = ['id', 'provider', 'model', 'topic_mix', 'selected_subtopics', 'diversity_params']
missing_cols = [col for col in required_cols if col not in abstracts_df.columns]
if missing_cols:
    print(f"ERROR: Input DataFrame is missing required columns: {', '.join(missing_cols)}. Check the generation script output.")
    raise SystemExit(1)


def _is_missing_scalar(val):
    """Return True for None/NaN/NA scalars.

    pd.isna() is element-wise on list-like values and its result cannot be
    used in a boolean context, so containers are reported as not missing.
    """
    return pd.api.types.is_scalar(val) and bool(pd.isna(val))


def safe_parse_dict(val):
    """Coerce one 'diversity_params' cell to a dict where possible.

    Dicts and missing values pass through unchanged; strings are parsed with
    ast.literal_eval and kept only if they evaluate to a dict; anything else
    becomes None.

    NOTE(review): the dict/missing branch was reconstructed from a corrupted
    source line and is assumed to have returned the value unchanged - confirm.
    """
    if isinstance(val, dict) or _is_missing_scalar(val):
        return val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (or too deep/large): treat as unparseable.
            return None
        return parsed if isinstance(parsed, dict) else None
    return None


# Data Type Check/Conversion (Optional but recommended)
# Feather usually preserves types, but check `diversity_params`
if any(
    not isinstance(x, dict)
    for x in abstracts_df['diversity_params']
    if not _is_missing_scalar(x)
):
    print("Warning: 'diversity_params' column contains non-dictionary elements. Attempting safe parsing...")
    abstracts_df['diversity_params'] = abstracts_df['diversity_params'].apply(safe_parse_dict)
    print("Completed safe parsing attempt for 'diversity_params'.")


# Build the codebook once; the same object is reused for every prompt.
codebook = create_codebook()
print("Codebook created.")

# Work on a copy of the input: the whole table when no cap applies (cap unset
# or not smaller than the table), otherwise a reproducible random subset.
if MAX_SAMPLES_TO_CODE is None or MAX_SAMPLES_TO_CODE >= len(abstracts_df):
    selected_df = abstracts_df.copy()
    print(f"Selected all {len(selected_df)} rows for coding.")
else:
    selected_df = abstracts_df.sample(n=MAX_SAMPLES_TO_CODE, random_state=42).copy()
    print(f"Selected {len(selected_df)} random samples for coding.")


# All coding outcomes (successes and failures) in processing order.
coding_results_list = []

# Number of rows that reached the coding step. This drives the periodic save:
# the DataFrame index labels are not contiguous after sampling, so they
# cannot be used as a progress counter.
rows_coded = 0

# Process each selected abstract row
print("Starting coding loop...")
for _, row in tqdm(selected_df.iterrows(), total=len(selected_df), desc="Coding Abstracts"):
    generating_provider = row.get('provider')
    generating_model = row.get('model')
    abstract_id = row.get('id') # Original variation ID

    # Rows without provider info cannot be mapped to their abstract column.
    if pd.isna(generating_provider):
        continue

    # --- Check if the abstract generation was successful ---
    # A non-NA value in "<provider>_error" marks a failed generation.
    error_col = f"{generating_provider}_error"
    if error_col in row and not pd.isna(row[error_col]):
        continue

    # --- Get the generated abstract text ---
    abstract_col = f"{generating_provider}_abstract"
    if abstract_col not in row or pd.isna(row[abstract_col]):
        continue
    abstract_text = row[abstract_col]

    # --- Prepare Ground Truth Information ---
    diversity_params_val = row.get('diversity_params')
    domain_val = ""
    if isinstance(diversity_params_val, dict):
        domain_val = diversity_params_val.get("domain", "")

    ground_truth = {
        "original_variation_id": abstract_id,
        "generating_model": generating_model,
        "generating_provider": generating_provider,
        "topic_mix": row.get('topic_mix'), # Should be dict
        "selected_subtopics": row.get('selected_subtopics'), # Should be dict
        "diversity_params": diversity_params_val, # Should be dict or None
        "domain": domain_val,
        "diversity_type": row.get('diversity_type'),
        "abstract_length_setting": row.get('abstract_length'),
        "allow_topic_mention": row.get('allow_topic_mention'),
        "allow_subtopic_mention": row.get('allow_subtopic_mention'),
        # Add any other relevant ground truth columns from the input df
        # e.g., 'topic_spec_id': row.get('topic_spec_id')
    }

    # The prompt depends only on the codebook and the abstract, so it is built
    # once per row and shared by all coding models.
    prompt = create_deductive_coding_prompt(codebook, abstract_text)

    # --- Perform Coding with each configured CODING model ---
    # Model names are used exactly as configured. An earlier in-loop override
    # swapped in a name that is not a CODING_MODELS key, which made
    # query_model() raise ValueError on every call for that model. To code
    # with a different model, add it to CODING_MODELS instead.
    for coding_model_name in CODING_MODELS:
        try:
            # Small randomized delay between API calls to stay under rate limits
            time.sleep(random.uniform(2.0, 3.0))

            # Query the coding model and parse its JSON answer
            raw_response = query_model(prompt, coding_model_name)
            coding_result_json = extract_json_from_response(raw_response)
        except Exception as e:
            # Best effort: record the failure and keep going with the next
            # model/row instead of aborting the whole run.
            error_message = f"Coding Exception: {type(e).__name__} - {str(e)}"
            print(f"\nERROR processing: {error_message}")
            coding_results_list.append({
                "original_variation_id": abstract_id,
                "generating_model": generating_model,
                "generating_provider": generating_provider,
                "coding_model": coding_model_name,
                "abstract_text_coded": abstract_text,
                "ground_truth": ground_truth,
                "error": error_message, # Store the detailed error
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "status": "processed_error"
            })
        else:
            coding_results_list.append({
                "original_variation_id": abstract_id,
                "generating_model": generating_model,
                "generating_provider": generating_provider,
                "coding_model": coding_model_name, # Model used for coding
                "abstract_text_coded": abstract_text, # The actual text coded
                "ground_truth": ground_truth, # Original parameters & info
                "coding_result": coding_result_json, # Parsed JSON result from coding model
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            })

    # --- Periodic Save ---
    rows_coded += 1
    if rows_coded % 20 == 0: # Save every 20 rows processed
        try:
            # Serialize first: if serialization fails, the file written by the
            # previous successful save is left untouched instead of truncated.
            serialized_results = json.dumps(coding_results_list, indent=2)
            with open(OUTPUT_CODING_RESULTS_FILE, 'w', encoding='utf-8') as f:
                f.write(serialized_results)
        except Exception as save_e:
            print(f"\nWarning: Periodic save failed: {save_e}")
               
            

In [None]:

# --- Final Save ---
print(f"\nFinished processing. Saving final {len(coding_results_list)} results...")
try:
    # Serialize before opening the output file. Opening with 'w' truncates the
    # file immediately, so serializing inside the `with` block would destroy
    # the last good periodic save whenever an entry is not JSON-serializable.
    final_payload = json.dumps(coding_results_list, indent=2)
    with open(OUTPUT_CODING_RESULTS_FILE, 'w', encoding='utf-8') as f:
        f.write(final_payload)
    print(f"Successfully saved final results to {OUTPUT_CODING_RESULTS_FILE}")
except Exception as save_e:
    print(f"ERROR: Final save failed: {save_e}")

print("Deductive Coding Process Complete.")

