""" prompts.py — System prompt, few-shot examples, and schema documentation. Used by model_inference.py to assemble the full prompt before inference. """ # ── System prompt (schema-agnostic rules) ────────────────────────────── SYSTEM_PROMPT = """You are an expert DuckDB SQL developer for school district administration. Generate ONLY valid DuckDB SQL queries wrapped in ```sql ``` markdown blocks. Follow these rules strictly: 1. Only SELECT statements — never INSERT, UPDATE, DELETE, DROP, ALTER, TRUNCATE. 2. Use the exact table and column names from the schema below. 3. Use proper DuckDB syntax: VARCHAR comparisons use single quotes, BOOLEAN checks use TRUE/FALSE. 4. When aggregating, use clear column aliases (e.g., AS total_students). 5. Join on the columns that logically connect the tables — infer the join key from schema. 6. If the question is ambiguous, make a reasonable assumption and generate the query. 7. Do NOT include any explanation outside the ```sql ``` block.""" # ── Schema documentation builder ─────────────────────────────────────── def build_schema_context(tables: dict[str, list[tuple[str, str, str]]]) -> str: """ Build a formatted schema context string for prompt injection. Args: tables: dict of table_name -> list of (column_name, type, description) Returns: Formatted string like: Table: enrollment school_year VARCHAR — The school year (format 'YYYY-YYYY') ... """ lines = ["Available Tables & Schemas:"] for idx, (table_name, columns) in enumerate(tables.items(), 1): lines.append(f"\n{idx}. Table: {table_name}") for col_name, col_type, col_desc in columns: lines.append(f" {col_name} ({col_type}) — {col_desc}") return "\n".join(lines) # ── Default seed schema (matches Phase 4 data) ───────────────────────── DEFAULT_SCHEMA = { "enrollment": [ ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"), ("school_name", "VARCHAR", "Name of the school"), ("grade_level", "INTEGER", "Grade level (K=0, 1-12)"), ("student_count", "INTEGER", "Number of students enrolled in that grade"), ], "attendance": [ ("student_id", "INTEGER", "Unique student identifier"), ("school_name", "VARCHAR", "Name of the school the student attends"), ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"), ("absence_count", "INTEGER", "Total absences for the school year"), ("is_chronically_absent", "BOOLEAN", "TRUE if student missed >= 10% of school days"), ], "students": [ ("student_id", "INTEGER", "Unique student identifier (joins to attendance, discipline, grades)"), ("school_name", "VARCHAR", "Name of the school the student attends"), ("grade_level", "INTEGER", "Current grade level (K=0, 1-12)"), ("gender", "VARCHAR", "'F' or 'M'"), ("race_ethnicity", "VARCHAR", "Race/ethnicity category (e.g., 'Hispanic/Latino', 'White', 'Black/African American', 'Asian', etc.)"), ("english_learner", "BOOLEAN", "TRUE if student is classified as an English Learner (EL/ELL)"), ("special_education", "BOOLEAN", "TRUE if student has an IEP / receives special education services"), ("economically_disadvantaged", "BOOLEAN", "TRUE if student qualifies for free/reduced lunch (FRL)"), ], "discipline": [ ("incident_id", "INTEGER", "Unique incident identifier"), ("student_id", "INTEGER", "Student involved (joins to students)"), ("school_name", "VARCHAR", "School where incident occurred"), ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"), ("grade_level", "INTEGER", "Student's grade at time of incident"), ("incident_type", "VARCHAR", "Type: 'Defiance', 'Fighting', 'Bullying', 'Vandalism', 'Disruption', 'Insubordination', 'Theft', 'Substance', 'Harassment', 'Verbal Altercation'"), ("incident_date", "VARCHAR", "Date of incident (format 'YYYY-MM-DD')"), ("severity", "VARCHAR", "'Minor', 'Major', or 'Severe'"), ("action_taken", "VARCHAR", "'Warning', 'Detention', 'Suspension', or 'Expulsion'"), ("days_suspended", "INTEGER", "Number of days suspended (0 if no suspension)"), ], "grades": [ ("student_id", "INTEGER", "Student (joins to students)"), ("school_name", "VARCHAR", "School"), ("school_year", "VARCHAR", "The school year (format 'YYYY-YYYY')"), ("grade_level", "INTEGER", "Grade level"), ("course_name", "VARCHAR", "'English', 'Math', 'Science', 'Social Studies', 'PE', 'Art', 'Music'"), ("term", "VARCHAR", "'Fall' or 'Spring'"), ("letter_grade", "VARCHAR", "'A', 'B', 'C', 'D', or 'F'"), ("grade_numeric", "DOUBLE", "Numeric grade: A=4.0, B=3.0, C=2.0, D=1.0, F=0.0"), ("gpa", "DOUBLE", "Term GPA (mean of all course grade_numeric for that term)"), ], } # ── Few-shot examples ────────────────────────────────────────────────── FEW_SHOT_EXAMPLES = [ { "question": "How many students were chronically absent in 2023-2024 across all schools?", "sql": "SELECT COUNT(*) AS chronically_absent_count\nFROM attendance\nWHERE school_year = '2023-2024' AND is_chronically_absent = TRUE;", }, { "question": "Show total enrollment per school for 2024-2025, sorted highest first.", "sql": "SELECT school_name, SUM(student_count) AS total_enrollment\nFROM enrollment\nWHERE school_year = '2024-2025'\nGROUP BY school_name\nORDER BY total_enrollment DESC;", }, { "question": "How many chronically absent students are English Learners in 2023-2024?", "sql": "SELECT COUNT(DISTINCT a.student_id) AS chronic_ell_count\nFROM attendance a\nJOIN students s ON a.student_id = s.student_id\nWHERE a.school_year = '2023-2024'\n AND a.is_chronically_absent = TRUE\n AND s.english_learner = TRUE;", }, { "question": "What's the average GPA for chronically absent students vs non-chronic students in 2023-2024?", "sql": "SELECT\n a.is_chronically_absent,\n ROUND(AVG(g.gpa), 2) AS avg_gpa\nFROM attendance a\nJOIN grades g ON a.student_id = g.student_id AND a.school_year = g.school_year\nWHERE a.school_year = '2023-2024'\nGROUP BY a.is_chronically_absent;", }, ] def build_few_shot_block(examples: list[dict] = None) -> str: """Build the few-shot examples block for prompt injection.""" if examples is None: examples = FEW_SHOT_EXAMPLES lines = ["Examples:"] for ex in examples: lines.append(f"\nQuestion: {ex['question']}") lines.append(f"```sql\n{ex['sql']}\n```") return "\n".join(lines) # ── Full prompt assembler ────────────────────────────────────────────── def build_prompt( user_question: str, schema: dict = None, examples: list[dict] = None, ) -> str: """ Assemble the full prompt for the fine-tuned Qwen2.5 LLM. Uses the Qwen2.5 chat template (matching training format): <|im_start|>system ... <|im_end|> <|im_start|>user Question: ... <|im_end|> <|im_start|>assistant Schema and few-shot examples are embedded in the system prompt since the fine-tuned model was trained with system + question → SQL. """ if schema is None: schema = DEFAULT_SCHEMA if examples is None: examples = FEW_SHOT_EXAMPLES # Build system prompt with schema and examples embedded system = SYSTEM_PROMPT + "\n\n" + build_schema_context(schema) if examples: system += "\n\n" + build_few_shot_block(examples) # Qwen2.5 chat template prompt = ( f"<|im_start|>system\n{system}<|im_end|>\n" f"<|im_start|>user\nQuestion: {user_question}<|im_end|>\n" f"<|im_start|>assistant\n" ) return prompt