"""
prompts.py — System prompt, few-shot examples, and schema documentation.

Used by model_inference.py to assemble the full prompt before inference.
"""

# ── System prompt (schema-agnostic rules) ──────────────────────────────

SYSTEM_PROMPT = """You are an expert DuckDB SQL developer for school district administration.
Generate ONLY valid DuckDB SQL queries wrapped in ```sql ``` markdown blocks.
Follow these rules strictly:

1. Only SELECT statements — never INSERT, UPDATE, DELETE, DROP, ALTER, TRUNCATE.
2. Use the exact table and column names from the schema below.
3. Use proper DuckDB syntax: VARCHAR comparisons use single quotes, BOOLEAN checks use TRUE/FALSE.
4. When aggregating, use clear column aliases (e.g., AS total_students).
5. Join on the columns that logically connect the tables — infer the join key from schema.
6. If the question is ambiguous, make a reasonable assumption and generate the query.
7. Do NOT include any explanation outside the ```sql ``` block."""


# ── Schema documentation builder ───────────────────────────────────────

def build_schema_context(tables: dict[str, list[tuple[str, str, str]]]) -> str:
    """
    Build a formatted schema context string for prompt injection.

    Args:
        tables: dict of table_name -> list of (column_name, type, description)

    Returns:
        Formatted string like:
            Table: enrollment
              school_year VARCHAR — The school year (format 'YYYY-YYYY')
              ...
    """
    lines = ["Available Tables & Schemas:"]
    for idx, (table_name, columns) in enumerate(tables.items(), 1):
        lines.append(f"\n{idx}. Table: {table_name}")
        for col_name, col_type, col_desc in columns:
            lines.append(f"   {col_name} ({col_type}) — {col_desc}")
    return "\n".join(lines)


# ── Default seed schema (matches Phase 4 data) ─────────────────────────

DEFAULT_SCHEMA = {
    "enrollment": [
        ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"),
        ("school_name", "VARCHAR", "Name of the school"),
        ("grade_level", "INTEGER", "Grade level (K=0, 1-12)"),
        ("student_count", "INTEGER", "Number of students enrolled in that grade"),
    ],
    "attendance": [
        ("student_id", "INTEGER", "Unique student identifier"),
        ("school_name", "VARCHAR", "Name of the school the student attends"),
        ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"),
        ("absence_count", "INTEGER", "Total absences for the school year"),
        ("is_chronically_absent", "BOOLEAN", "TRUE if student missed >= 10% of school days"),
    ],
    "students": [
        ("student_id", "INTEGER", "Unique student identifier (joins to attendance, discipline, grades)"),
        ("school_name", "VARCHAR", "Name of the school the student attends"),
        ("grade_level", "INTEGER", "Current grade level (K=0, 1-12)"),
        ("gender", "VARCHAR", "'F' or 'M'"),
        ("race_ethnicity", "VARCHAR", "Race/ethnicity category (e.g., 'Hispanic/Latino', 'White', 'Black/African American', 'Asian', etc.)"),
        ("english_learner", "BOOLEAN", "TRUE if student is classified as an English Learner (EL/ELL)"),
        ("special_education", "BOOLEAN", "TRUE if student has an IEP / receives special education services"),
        ("economically_disadvantaged", "BOOLEAN", "TRUE if student qualifies for free/reduced lunch (FRL)"),
    ],
    "discipline": [
        ("incident_id", "INTEGER", "Unique incident identifier"),
        ("student_id", "INTEGER", "Student involved (joins to students)"),
        ("school_name", "VARCHAR", "School where incident occurred"),
        ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"),
        ("grade_level", "INTEGER", "Student's grade at time of incident"),
        ("incident_type", "VARCHAR", "Type: 'Defiance', 'Fighting', 'Bullying', 'Vandalism', 'Disruption', 'Insubordination', 'Theft', 'Substance', 'Harassment', 'Verbal Altercation'"),
        ("incident_date", "VARCHAR", "Date of incident (format 'YYYY-MM-DD')"),
        ("severity", "VARCHAR", "'Minor', 'Major', or 'Severe'"),
        ("action_taken", "VARCHAR", "'Warning', 'Detention', 'Suspension', or 'Expulsion'"),
        ("days_suspended", "INTEGER", "Number of days suspended (0 if no suspension)"),
    ],
    "grades": [
        ("student_id", "INTEGER", "Student (joins to students)"),
        ("school_name", "VARCHAR", "School"),
        ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"),
        ("grade_level", "INTEGER", "Grade level"),
        ("course_name", "VARCHAR", "'English', 'Math', 'Science', 'Social Studies', 'PE', 'Art', 'Music'"),
        ("term", "VARCHAR", "'Fall' or 'Spring'"),
        ("letter_grade", "VARCHAR", "'A', 'B', 'C', 'D', or 'F'"),
        ("grade_numeric", "DOUBLE", "Numeric grade: A=4.0, B=3.0, C=2.0, D=1.0, F=0.0"),
        ("gpa", "DOUBLE", "Term GPA (mean of all course grade_numeric for that term)"),
    ],
}


# ── Few-shot examples ──────────────────────────────────────────────────

FEW_SHOT_EXAMPLES = [
    {
        "question": "How many students were chronically absent in 2023-2024 across all schools?",
        "sql": "SELECT COUNT(*) AS chronically_absent_count\nFROM attendance\nWHERE school_year = '2023-2024' AND is_chronically_absent = TRUE;",
    },
    {
        "question": "Show total enrollment per school for 2024-2025, sorted highest first.",
        "sql": "SELECT school_name, SUM(student_count) AS total_enrollment\nFROM enrollment\nWHERE school_year = '2024-2025'\nGROUP BY school_name\nORDER BY total_enrollment DESC;",
    },
    {
        "question": "How many chronically absent students are English Learners in 2023-2024?",
        "sql": "SELECT COUNT(DISTINCT a.student_id) AS chronic_ell_count\nFROM attendance a\nJOIN students s ON a.student_id = s.student_id\nWHERE a.school_year = '2023-2024'\n  AND a.is_chronically_absent = TRUE\n  AND s.english_learner = TRUE;",
    },
    {
        "question": "What's the average GPA for chronically absent students vs non-chronic students in 2023-2024?",
        "sql": "SELECT\n  a.is_chronically_absent,\n  ROUND(AVG(g.gpa), 2) AS avg_gpa\nFROM attendance a\nJOIN grades g ON a.student_id = g.student_id AND a.school_year = g.school_year\nWHERE a.school_year = '2023-2024'\nGROUP BY a.is_chronically_absent;",
    },
]


def build_few_shot_block(examples: list[dict] = None) -> str:
    """Build the few-shot examples block for prompt injection."""
    if examples is None:
        examples = FEW_SHOT_EXAMPLES
    lines = ["Examples:"]
    for ex in examples:
        lines.append(f"\nQuestion: {ex['question']}")
        lines.append(f"```sql\n{ex['sql']}\n```")
    return "\n".join(lines)


# ── Full prompt assembler ──────────────────────────────────────────────

def build_prompt(
    user_question: str,
    schema: dict = None,
    examples: list[dict] = None,
) -> str:
    """
    Assemble the full prompt for the fine-tuned Qwen2.5 LLM.

    Uses the Qwen2.5 chat template (matching training format):
    <|im_start|>system ... <|im_end|>
    <|im_start|>user Question: ... <|im_end|>
    <|im_start|>assistant

    Schema and few-shot examples are embedded in the system prompt
    since the fine-tuned model was trained with system + question → SQL.
    """
    if schema is None:
        schema = DEFAULT_SCHEMA
    if examples is None:
        examples = FEW_SHOT_EXAMPLES

    # Build system prompt with schema and examples embedded
    system = SYSTEM_PROMPT + "\n\n" + build_schema_context(schema)
    if examples:
        system += "\n\n" + build_few_shot_block(examples)

    # Qwen2.5 chat template
    prompt = (
        f"<|im_start|>system\n{system}<|im_end|>\n"
        f"<|im_start|>user\nQuestion: {user_question}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    return prompt