"""
Scenario registry — 8 EU AI Act compliance audit scenarios across 3 difficulty tiers.

Investigation-grade: Each tool returns realistic regulatory documents that
require analysis to identify violations. No pre-digested verdicts — the agent
must reason about the evidence to find compliance gaps.

Easy (2):   Clear-cut systems, shorter documents, obvious violations
Medium (3): Detailed documents with statistical evidence, red herrings mixed in
Hard (3):   Ambiguous framing, misleading deployer claims, compound violations
"""

from __future__ import annotations

import random
from typing import Dict, List, Optional

from server.engine import AuditScenario, StateGraph, StateNode, Transition


# ---------------------------------------------------------------------------
# Unique state graph builder
# ---------------------------------------------------------------------------

def _build_scenario_graph(
    investigation_tools: List[str],
    is_prohibited: bool = False,
) -> StateGraph:
    """Build a state graph unique to this scenario's investigation path.

    Every graph shares the same backbone::

        initial → overview → classified → ... → findings_submitted
                → remediation_proposed → resolved

    For non-prohibited systems the tools in ``investigation_tools`` are chained
    in order between ``classified`` and ``findings_submitted``; each tool has
    exactly one "progress" transition, from its predecessor's node into its
    own node. Investigation tools that are *not* in the path get self-loop
    "no_effect" transitions from every pre-findings state, so they can be
    called without advancing the graph.

    NOTE: when ``is_prohibited`` is True, nodes for ``investigation_tools`` are
    still added to the graph, but no progress transition leads into them;
    ``classified`` goes straight to ``findings_submitted``.

    Args:
        investigation_tools: Ordered list of investigation tool names forming
            the progress path (e.g. ["check_documentation", "audit_training_data"]).
        is_prohibited: If True, classification leads directly to findings
            (no extended investigation needed for prohibited systems).

    Returns:
        The populated ``StateGraph``.

    Raises:
        KeyError: If ``investigation_tools`` names a tool that is not one of
            the known investigation tools.
    """
    # Investigation tool → (node id, human-readable label)
    tool_nodes = {
        "check_documentation": ("docs_reviewed", "Documentation Reviewed"),
        "audit_training_data": ("data_audited", "Training Data Audited"),
        "verify_human_oversight": ("oversight_checked", "Human Oversight Verified"),
        "check_transparency": ("transparency_checked", "Transparency Checked"),
        "assess_risk_management": ("risk_assessed", "Risk Management Assessed"),
        "check_logging": ("logging_checked", "Logging Verified"),
    }

    path_tools = list(investigation_tools)

    # Fail fast with an actionable message instead of a bare KeyError raised
    # halfway through graph construction. KeyError is kept as the exception
    # type so existing callers see the same error class as before.
    unknown_tools = [tool for tool in path_tools if tool not in tool_nodes]
    if unknown_tools:
        raise KeyError(
            f"Unknown investigation tool(s) {unknown_tools!r}; "
            f"expected one of {sorted(tool_nodes)!r}"
        )

    path_tool_set = set(path_tools)
    path_node_ids = [tool_nodes[tool][0] for tool in path_tools]

    g = StateGraph()

    # Always-present nodes
    g.add_node(StateNode("initial", "Audit Assigned", is_start=True))
    g.add_node(StateNode("overview", "System Overview Gathered"))
    g.add_node(StateNode("classified", "Risk Classification Done"))
    g.add_node(StateNode("findings_submitted", "Findings Submitted"))
    g.add_node(StateNode("remediation_proposed", "Remediation Recommended"))
    g.add_node(StateNode("resolved", "Compliance Verified", is_terminal=True))

    # Add nodes only for tools in the investigation path
    for tool in path_tools:
        node_id, label = tool_nodes[tool]
        g.add_node(StateNode(node_id, label))

    # --- Build progress chain ---
    # initial → overview → classified → [investigation tools...] → findings → remediation → resolved
    g.add_transition(Transition("initial", "overview", "get_system_overview", "progress",
        description="Gather system overview and deployment context"))
    g.add_transition(Transition("overview", "classified", "classify_system", "progress",
        description="Classify the AI system risk category"))

    if is_prohibited:
        # Prohibited: classify → findings directly
        g.add_transition(Transition("classified", "findings_submitted", "submit_finding", "progress",
            description="Report prohibited AI system"))
    else:
        # Chain investigation tools in order; with an empty path this
        # degenerates to classified → findings_submitted.
        prev_state = "classified"
        for tool, node_id in zip(path_tools, path_node_ids):
            g.add_transition(Transition(prev_state, node_id, tool, "progress",
                description=f"Progress: {tool}"))
            prev_state = node_id
        g.add_transition(Transition(prev_state, "findings_submitted", "submit_finding", "progress",
            description="Submit compliance findings"))

    g.add_transition(Transition("findings_submitted", "remediation_proposed", "recommend_fix", "progress",
        description="Propose remediation actions"))
    g.add_transition(Transition("remediation_proposed", "resolved", "verify_compliance", "progress",
        description="Final compliance determination"))

    # States that precede findings submission. "findings_submitted" and
    # "remediation_proposed" are never part of this list, so no extra
    # filtering is needed when iterating over it below.
    pre_findings_states = ["initial", "overview", "classified", *path_node_ids]

    # --- No-effect transitions (investigation tools not in the path) ---
    # These tools work (return data) but don't advance the state graph
    for tool in tool_nodes:
        if tool in path_tool_set:
            continue
        # Can call from any pre-findings state, no effect on graph
        for state_id in pre_findings_states:
            g.add_transition(Transition(state_id, state_id, tool, "no_effect",
                description=f"{tool} not required for this scenario"))

    # --- No-effect for premature terminal actions ---
    for state_id in pre_findings_states:
        g.add_transition(Transition(state_id, state_id, "verify_compliance", "no_effect",
            description="Cannot verify before completing audit"))

    # --- Worsened transitions (trap actions) ---
    g.add_transition(Transition("initial", "initial", "classify_system", "worsened",
        description="Classifying without understanding the system first"))
    for state_id in ("initial", "overview", "classified"):
        g.add_transition(Transition(state_id, state_id, "recommend_fix", "worsened",
            description="Proposing fixes before identifying problems"))
    for state_id in ("initial", "overview"):
        g.add_transition(Transition(state_id, state_id, "submit_finding", "no_effect",
            description="Submitting findings before investigation"))

    return g


# ---------------------------------------------------------------------------
# EASY SCENARIOS
# ---------------------------------------------------------------------------

def _easy_chatbot() -> AuditScenario:
    """Build the easy-tier RetailAssist chatbot transparency scenario.

    The scenario is a limited-risk customer service chatbot whose ground-truth
    findings are a missing AI disclosure and the lack of a customer-initiated
    human escalation option. The investigation documents are defined first,
    then passed to ``AuditScenario``; finally a graph whose progress path is
    ``check_transparency`` → ``verify_human_oversight`` is attached.

    The texts contain ``__COMPANY__``, ``__VERSION__``, ``__DATE__`` and
    ``__USERCOUNT__`` tokens; presumably these are substituted elsewhere
    before the documents are shown — this function leaves them untouched.
    """
    # --- Investigation documents ---

    documentation_doc = """\
TECHNICAL DOCUMENTATION INVENTORY
System: RetailAssist Chatbot __VERSION__
Deployer: __COMPANY__
Review Date: __DATE__

DOCUMENT STATUS:
  System Architecture Overview          Present   (updated 2025-11-03)
  Data Processing Description           Present   (updated 2025-09-22)
  Input/Output Specification            Present   (updated 2025-11-03)
  Performance Metrics Report            Present   (accuracy 94.2% on test set)
  User Interaction Guidelines           Present   (updated 2025-06-10)

ANNEX IV CROSS-REFERENCE (for limited-risk, advisory only):
  Section 1(a) Intended purpose         Documented
  Section 1(b) Deployer identification  Documented
  Section 2(a) Development methods      Documented

NOTE: Documentation is generally adequate for a limited-risk system.
The system processes no biometric data and makes no legally significant
decisions. Standard chatbot architecture with retrieval-augmented generation.

No gaps identified in core technical documentation.
The primary compliance concern for this system category relates to
Article 50 transparency obligations, not documentation completeness."""

    training_data_doc = """\
TRAINING DATA SUMMARY
System: RetailAssist Chatbot __VERSION__

Dataset: Product catalog entries + FAQ question-answer pairs
Volume: 52,847 examples (last updated 2025-10-15)

DATA COMPOSITION:
  Product descriptions         31,204 entries
  FAQ pairs                    14,892 entries
  Customer service transcripts  6,751 entries (anonymized)

Personal data in training set: None identified
  - Customer transcripts were fully anonymized before inclusion
  - No names, emails, or account numbers in training data
  - PII scrubbing verified by DPO on 2025-08-20

Bias assessment: Not formally required for limited-risk chatbot systems.
The system provides product information and does not make decisions
affecting individuals' rights or access to services.

Data governance: Adequate — data sources documented, retention policy
in place (36-month cycle), access controls implemented."""

    oversight_doc = """\
HUMAN OVERSIGHT PROCEDURES
System: RetailAssist Chatbot __VERSION__
Department: Customer Experience Team

CURRENT PROCESS:
  - Customer service team of 12 agents monitors a dashboard showing
    flagged conversations (profanity filter, sentiment < 0.3, repeat queries)
  - Approximately 8% of conversations are flagged for human review
  - Average response time for flagged conversations: 4.2 minutes
  - Team operates during business hours (08:00-20:00 CET, Mon-Sat)

ESCALATION PROCEDURE:
  The chatbot displays a generic "Was this helpful?" prompt after each
  interaction. If the user clicks "No", the chatbot offers to repeat
  the answer or try a different phrasing.

  There is no option for the user to request transfer to a human agent
  during the conversation. The "Contact Us" page exists separately on
  the website but is not linked from the chat interface.

  After business hours, flagged conversations queue until the next
  business day. No real-time human intervention is available outside
  business hours.

OVERRIDE CAPABILITY:
  Customer service agents can take over any active conversation from the
  monitoring dashboard, but this is agent-initiated, not customer-initiated.
  Customers have no mechanism to trigger human intervention directly."""

    transparency_doc = """\
TRANSPARENCY & DISCLOSURE REVIEW
System: RetailAssist Chatbot __VERSION__

USER INTERFACE ANALYSIS:
  The chatbot appears as a widget in the bottom-right corner of the
  website. The chat window header reads "RetailAssist — How can I help?"
  with a generic chat bubble icon.

  Opening message: "Hi! I'm here to help you find products, track
  orders, and answer questions about our store. What can I do for you?"

  The chat interface includes:
    - Text input field
    - Send button
    - Emoji picker
    - "Was this helpful?" feedback after each response

  SEARCHED FOR AI DISCLOSURE INDICATORS:
    - Chat header:              No mention of AI or automated system
    - Opening message:          No mention of AI or automated system
    - Website footer:           Standard legal links (Privacy, Terms, Cookies)
    - Terms of Service:         Section 7.3 mentions "automated tools may assist
                                customer service" but this is not shown to users
                                during chat interaction
    - Privacy Policy:           Section 4.1 mentions "AI-powered features" in
                                general terms but does not specifically disclose
                                the chatbot as AI
    - Chat widget tooltip:      "Chat with us" (no AI indication)
    - Accessibility statement:  No mention of AI

  Article 50(1) requires: "Providers shall ensure that AI systems intended
  to directly interact with natural persons are designed and developed in
  such a way that the natural persons concerned are informed that they are
  interacting with an AI system."

CONTENT LABELING:
  Not applicable — system generates text responses only, no deepfakes
  or synthetic media requiring Article 50(2) labeling."""

    risk_assessment_doc = """\
RISK CLASSIFICATION ANALYSIS
System: RetailAssist Chatbot __VERSION__

ANNEX III HIGH-RISK CHECK:
  1. Biometric identification               Not applicable
  2. Critical infrastructure                 Not applicable
  3. Education and vocational training       Not applicable
  4. Employment, workers management          Not applicable
  5. Access to essential services            Not applicable — provides product
     information only, no credit/insurance/social services decisions
  6. Law enforcement                         Not applicable
  7. Migration, asylum, border control       Not applicable
  8. Administration of justice               Not applicable

ARTICLE 5 PROHIBITED PRACTICES CHECK:
  Subliminal manipulation                    Not applicable
  Exploitation of vulnerabilities            Not applicable
  Social scoring                             Not applicable
  Real-time biometric identification         Not applicable

RISK LEVEL DETERMINATION: Limited Risk
  The system falls under Article 50 transparency obligations as an
  AI system designed to directly interact with natural persons.
  No high-risk or prohibited characteristics identified.

CONFORMITY ASSESSMENT: Not required for limited-risk systems.
  Voluntary code of conduct adherence recommended per Article 95."""

    logging_doc = """\
LOGGING & TRACEABILITY REVIEW
System: RetailAssist Chatbot __VERSION__

CURRENT LOGGING:
  Conversation logs:     Retained for 30 days, then anonymized + archived
  Model version:         Tracked per deployment (current: __VERSION__)
  User feedback:         Stored with conversation ID
  System health metrics: Uptime, response latency, error rate

AUDIT TRAIL:
  Each conversation has a unique ID linking:
    - User session (anonymized)
    - Model version used
    - All messages in sequence
    - Feedback received
    - Any flags triggered

  Logs are adequate for a limited-risk system. Article 12 automatic
  logging requirements apply primarily to high-risk systems.
  Current logging exceeds what is required for the system's risk level."""

    # --- Scenario metadata and ground truth ---
    audit_brief = (
        "A customer service chatbot deployed on a retail website needs compliance "
        "review. The system uses a general-purpose LLM to answer product questions. "
        "Initial assessment suggests it may be missing required transparency "
        "disclosures under Article 50. Conduct a focused audit."
    )
    system_summary = (
        "General-purpose LLM-based chatbot answering customer questions about "
        "products, shipping, and returns. No biometric processing, no profiling, "
        "no high-risk decisions. Deployed on company website with __USERCOUNT__ "
        "daily users."
    )

    scenario = AuditScenario(
        scenario_id="easy_chatbot_transparency_001",
        title="Customer Service Chatbot — Transparency Audit",
        difficulty="easy",
        description=audit_brief,
        system_name="RetailAssist Chatbot",
        system_description=system_summary,
        system_category="limited_risk",
        deployer_info="__COMPANY__ — mid-size EU e-commerce retailer, 200 employees, operating in DE/FR/NL.",
        correct_classification="limited_risk",
        ground_truth_findings=["missing_ai_disclosure", "no_human_escalation_option"],
        required_remediation=["add_ai_disclosure_banner", "implement_human_handoff"],
        red_herrings=["high_risk_automated_decision"],
        documentation_data=documentation_doc,
        training_data_info=training_data_doc,
        oversight_info=oversight_doc,
        transparency_info=transparency_doc,
        risk_assessment_info=risk_assessment_doc,
        logging_info=logging_doc,
    )

    # Only transparency and oversight checks advance this scenario's graph;
    # the remaining investigation tools are registered as no-effect by the builder.
    progress_path = ["check_transparency", "verify_human_oversight"]
    scenario.graph = _build_scenario_graph(investigation_tools=progress_path)
    return scenario


def _easy_recommendation() -> AuditScenario:
    """Build the easy-tier MelodyMatch minimal-risk verification scenario.

    The scenario is a music recommendation engine whose correct classification
    is ``minimal_risk``; the only ground-truth finding is that a voluntary code
    of conduct is recommended, with GDPR and high-risk profiling listed as red
    herrings. The investigation documents are defined first, then passed to
    ``AuditScenario``; finally a graph whose progress path is
    ``check_transparency`` → ``assess_risk_management`` is attached.

    The texts contain ``__COMPANY__``, ``__VERSION__`` and ``__USERCOUNT__``
    tokens; presumably these are substituted elsewhere before the documents
    are shown — this function leaves them untouched.
    """
    # --- Investigation documents ---

    documentation_doc = """\
TECHNICAL DOCUMENTATION INVENTORY
System: MelodyMatch Algorithm __VERSION__
Deployer: __COMPANY__

DOCUMENT STATUS:
  System Architecture                   Present   (hybrid collaborative filtering)
  Algorithm Description                 Present   (item-item CF + content embeddings)
  Data Pipeline Documentation           Present   (Spark ETL pipeline)
  Performance Metrics                   Present   (hit@10: 0.342, NDCG: 0.281)
  API Documentation                     Present   (REST API for mobile/web clients)

All core technical documents are present and current.
The system is a standard recommendation engine with no novel or
experimental components requiring additional documentation."""

    training_data_doc = """\
TRAINING DATA SUMMARY
System: MelodyMatch Algorithm __VERSION__

Dataset: Anonymized listening history from __USERCOUNT__ users
Volume: 10.3M user-song interactions (2023-2025)

DATA COMPOSITION:
  Interaction types:    Play, skip, save, playlist-add
  User features:        Pseudonymized user ID, country, subscription tier
  Song features:        Genre, tempo, energy, valence, artist, release year

Personal data assessment:
  - User IDs are pseudonymized (SHA-256 hash, no reversal possible)
  - No names, emails, or demographic data in training set
  - Country used for regional catalog filtering only
  - GDPR Article 6(1)(f) legitimate interest basis documented

Bias considerations:
  Music recommendations do not involve protected characteristics.
  Popularity bias exists (mainstream content recommended more often)
  but this does not constitute discrimination under the AI Act.
  No individuals are disadvantaged in access to services or rights.

Note: Some stakeholders raised concerns about "profiling" under GDPR.
Music taste profiling for recommendations is distinct from profiling
under the AI Act which concerns evaluation of personal aspects
affecting natural persons' rights. Recommendation of entertainment
content does not qualify as high-risk profiling."""

    oversight_doc = """\
HUMAN OVERSIGHT ASSESSMENT
System: MelodyMatch Algorithm __VERSION__

The EU AI Act does not impose specific human oversight requirements
on minimal-risk AI systems. Article 14 human oversight obligations
apply to high-risk AI systems only.

Current oversight measures (voluntary):
  - Product team reviews recommendation quality metrics weekly
  - A/B testing framework validates algorithm changes before rollout
  - Content moderation team can flag/remove specific recommendations
  - Users can provide feedback ("Don't recommend this") per track"""

    transparency_doc = """\
TRANSPARENCY REVIEW
System: MelodyMatch Algorithm __VERSION__

USER-FACING DISCLOSURES:
  The app includes a "How recommendations work" info page accessible
  from Settings > Privacy > Personalization, which states:

  "MelodyMatch uses your listening history to suggest music you might
  enjoy. Our algorithm analyzes patterns in what you play, skip, and
  save to find similar tracks and artists."

  Recommendations are labeled "Recommended for you" and "Based on
  your recent listening" in the UI.

  Article 50 transparency obligations for systems interacting directly
  with natural persons apply to chatbots and emotion recognition, not
  to content recommendation systems. No mandatory transparency
  obligations identified for this system category.

  Voluntary transparency: The deployer has voluntarily disclosed the
  algorithmic nature of recommendations, which aligns with Article 95
  (voluntary codes of conduct for non-high-risk AI)."""

    risk_assessment_doc = """\
RISK CLASSIFICATION ANALYSIS
System: MelodyMatch Algorithm __VERSION__

ANNEX III HIGH-RISK CHECK:
  1. Biometric identification               Not applicable
  2. Critical infrastructure                 Not applicable
  3. Education and vocational training       Not applicable
  4. Employment, workers management          Not applicable
  5. Access to essential services            Not applicable
  6. Law enforcement                         Not applicable
  7. Migration, asylum, border control       Not applicable
  8. Administration of justice               Not applicable

ARTICLE 5 PROHIBITED PRACTICES:
  None applicable — content recommendation is not subliminal
  manipulation, social scoring, or biometric identification.

RISK LEVEL DETERMINATION: Minimal Risk
  The system recommends entertainment content. It does not make
  decisions affecting natural persons' rights, access to services,
  or legal status. No Annex III category applies.

  Under Article 95, deployers of minimal-risk AI systems are
  encouraged to voluntarily apply codes of conduct for transparency
  and trustworthiness, but this is not mandatory."""

    logging_doc = """\
LOGGING REVIEW
System: MelodyMatch Algorithm __VERSION__

Article 12 automatic logging requirements apply to high-risk AI
systems. This minimal-risk system is not subject to those obligations.

Current logging (voluntary):
  Model version tracking:    Yes (per deployment)
  Recommendation logs:       Retained 90 days
  A/B test results:          Archived indefinitely
  User feedback:             Linked to recommendation events

Logging is adequate and exceeds requirements for the risk level."""

    # --- Scenario metadata and ground truth ---
    audit_brief = (
        "A music streaming platform's recommendation algorithm needs EU AI Act "
        "classification. The system suggests songs based on listening history. "
        "Stakeholders believe it's minimal risk but need formal verification."
    )
    system_summary = (
        "Collaborative filtering recommendation engine for music. Suggests songs "
        "based on listening patterns. No personal profiling beyond music preferences. "
        "No decisions affecting legal rights. Pure content personalization for "
        "__USERCOUNT__ EU users."
    )

    scenario = AuditScenario(
        scenario_id="easy_recommendation_minimal_001",
        title="Music Recommendation Engine — Minimal Risk Verification",
        difficulty="easy",
        description=audit_brief,
        system_name="MelodyMatch Algorithm",
        system_description=system_summary,
        system_category="minimal_risk",
        deployer_info="__COMPANY__ — Swedish music streaming startup, 50 employees.",
        correct_classification="minimal_risk",
        ground_truth_findings=["voluntary_code_of_conduct_recommended"],
        required_remediation=["adopt_voluntary_transparency_code"],
        red_herrings=["gdpr_violation", "high_risk_profiling"],
        documentation_data=documentation_doc,
        training_data_info=training_data_doc,
        oversight_info=oversight_doc,
        transparency_info=transparency_doc,
        risk_assessment_info=risk_assessment_doc,
        logging_info=logging_doc,
    )

    # Short progress path: the audit is mainly a classification check, so only
    # the transparency and risk-management tools advance the graph.
    progress_path = ["check_transparency", "assess_risk_management"]
    scenario.graph = _build_scenario_graph(investigation_tools=progress_path)
    return scenario


# ---------------------------------------------------------------------------
# MEDIUM SCENARIOS
# ---------------------------------------------------------------------------

def _medium_hiring() -> AuditScenario:
    s = AuditScenario(
        scenario_id="medium_hiring_bias_001",
        title="AI Resume Screener — High-Risk Employment System Audit",
        difficulty="medium",
        description=(
            "A recruitment platform uses AI to screen and rank job applicants. "
            "This falls under Annex III (employment, workers management) making it "
            "a candidate for HIGH-RISK classification. Reports suggest potential "
            "gender bias in technical role screening. Full compliance audit required."
        ),
        system_name="TalentRank AI Screener",
        system_description=(
            "NLP-based resume screening system that scores and ranks applicants for "
            "job openings. Processes name, education, work history, skills. Used by "
            "500+ EU companies for initial candidate filtering. Automated decisions "
            "affect who gets interviews. Deployed in __REGION__ with __USERCOUNT__ "
            "applicants processed to date."
        ),
        system_category="high_risk",
        deployer_info="__COMPANY__ — French HR-tech company, 300 employees, serving 500+ EU enterprise clients.",
        correct_classification="high_risk",
        ground_truth_findings=[
            "gender_bias_in_technical_screening",
            "insufficient_human_oversight",
            "incomplete_technical_documentation",
            "missing_fundamental_rights_impact_assessment",
            "inadequate_data_governance",
        ],
        required_remediation=[
            "conduct_bias_audit_and_mitigation",
            "implement_human_review_for_all_rejections",
            "complete_annex_iv_technical_documentation",
            "perform_fundamental_rights_impact_assessment",
            "establish_data_governance_framework",
        ],
        red_herrings=[
            "prohibited_social_scoring",
            "biometric_processing",
        ],

        documentation_data="""\
TECHNICAL DOCUMENTATION INVENTORY
System: TalentRank AI Screener __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS MATRIX:
  Document                                  Status     Last Updated
  ─────────────────────────────────────────────────────────────────
  System Architecture Overview              Present    2025-11-03
  Algorithm Description (NLP Pipeline)      Present    2025-09-22
  Input Data Specification                  Present    2025-11-03
  Output Specification                      Present    2025-11-03
  Performance Metrics Report                Absent     —
  Accuracy by Demographic Group Report      Absent     —
  Data Governance Policy                    Absent     —
  Fundamental Rights Impact Assessment      Absent     —
  Risk Management Plan                      Present    2024-08-15
  Post-Market Monitoring Plan               Draft      2025-12-01
  User Instructions (Article 13)            Partial    2025-06-10
  Change Management Log                     Present    2025-11-28

EU AI ACT ANNEX IV CROSS-REFERENCE:
  Section 1 — General Description
    (a) Intended purpose                       Documented
    (b) Deployer name and contact              Documented
    (c) Version and update history             Documented

  Section 2 — Detailed Description
    (a) Development methods and process        Documented
    (b) Design specifications and rationale    Not documented — no architecture
        diagrams for the scoring pipeline
    (c) Data requirements and provenance       Partial — data sources listed but
        no data governance policy document
    (d) Metrics and performance benchmarks     Not documented — no disaggregated
        performance metrics by demographic group
    (e) Computational resources                Documented
    (f) Expected lifetime and maintenance      Documented
    (g) Quality management procedures          Not documented

  Section 3 — Monitoring and Reporting
    (a) Post-market monitoring plan            Draft only — not finalized
    (b) Logging requirements specification     Documented

OBSERVATIONS:
  The Risk Management Plan was last updated 18 months ago. It does not
  reference the EU AI Act, harmonised standards, or Article 9 requirements.
  The Post-Market Monitoring Plan remains in draft status with no target
  completion date established.
  Article 13 user instructions do not include information about the system's
  accuracy levels, known limitations, or demographic performance variation.
  No Fundamental Rights Impact Assessment has been conducted.""",

        training_data_info="""\
TRAINING DATA AUDIT REPORT
System: TalentRank AI Screener __VERSION__
Dataset: Historical hiring decisions from EU enterprise clients
Records: 2,147,832 applications (2019-2025)
Last model retrain: 2025-10-15

DEMOGRAPHIC DISTRIBUTION OF TRAINING DATA:
  Category                Count        Pct of Total
  ────────────────────────────────────────────────
  Male applicants         1,342,104    62.5%
  Female applicants         805,728    37.5%
  Age 18-30                 751,741    35.0%
  Age 31-45                 923,168    43.0%
  Age 46-65                 472,923    22.0%
  Western EU origin       1,567,516    73.0%
  Eastern EU origin         580,316    27.0%

CALLBACK RATES BY DEMOGRAPHIC (Technical Roles Only):
  Group                  Applications   Callbacks   Rate     vs Baseline
  ──────────────────────────────────────────────────────────────────────
  Male applicants           487,204      166,624    34.2%    (baseline)
  Female applicants         198,731       52,266    26.3%    -23.1%
  Age 18-30                 241,367       91,961    38.1%    +11.4%
  Age 31-45                 298,422      100,927    33.8%     -1.2%
  Age 46-65                 146,146       41,924    28.7%    -16.1%
  Western EU                512,489      179,884    35.1%     +2.6%
  Eastern EU                173,446       47,547    27.4%    -19.9%

  Note: Callback rate = percentage of applicants advanced to interview stage.
  Baseline = overall male callback rate for technical roles.

DATA SOURCES:
  Source                          Records    Period       Documented
  ────────────────────────────────────────────────────────────────
  Enterprise client ATS exports   1,847,291  2019-2025   Yes
  Partner recruiting platforms      203,114  2021-2025   Yes
  Internal test applications         97,427  2024-2025   Yes

DATA GOVERNANCE:
  Data retention policy:           Present (5-year retention)
  Source documentation:            47 of 52 sources documented
  Personal data inventory:         Names, education, work history, skills
  Consent mechanism:               Employer-level agreements in place;
                                   individual applicant consent not obtained
  Article 10 specific provisions:  Not addressed in current data governance
  Data quality assessment:         Last conducted 2024-06-12

ADDITIONAL CONTEXT:
  A separate fraud detection module shares the same data lake infrastructure
  but operates independently with its own pipeline. The fraud detection
  module passed its most recent bias audit with no findings (2025-09-01).
  The fraud system's compliance status has no bearing on TalentRank's
  compliance obligations.""",

        oversight_info="""\
HUMAN OVERSIGHT PROCEDURES
System: TalentRank AI Screener __VERSION__
Department: Client Success & Quality Assurance

REVIEW PROCESS:
  TalentRank's screening pipeline processes applications in three stages:

  Stage 1 — Automated Screening (AI):
    All incoming applications are scored by the AI model (0-100).
    Applications scoring below the client-configured threshold (default: 40)
    are automatically marked as "Not Progressed."

  Stage 2 — Quality Sampling:
    The QA team reviews a random sample of screened applications.
    Current sampling rate: 5% of "Not Progressed" decisions.
    Sampling is conducted weekly in batch.

  Stage 3 — Client Review:
    Applications marked "Progressed" are presented to the hiring manager
    for final interview selection. Clients may also view "Not Progressed"
    applications if they choose, but fewer than 2% of clients do so.

REVIEW STATISTICS (Q4 2025):
  Applications processed:         347,291
  Automatically rejected:         208,375  (60.0%)
  QA sample reviewed:              10,419  (5.0% of rejections)
  QA overrides (rejection → pass):    312  (3.0% of samples)
  Client-initiated reviews:          4,166  (2.0% of clients)

OVERRIDE CAPABILITY:
  Both QA staff and client hiring managers can override any AI decision.
  The override interface is accessible from the application dashboard.
  However, the system does not proactively flag borderline cases or
  indicate confidence scores to reviewers.

MONITORING:
  No ongoing bias monitoring system is in place. The QA sampling is
  focused on general quality, not demographic fairness. No automated
  alerts exist for drift in rejection rates across demographic groups.""",

        transparency_info="""\
TRANSPARENCY & USER NOTIFICATION REVIEW
System: TalentRank AI Screener __VERSION__

APPLICANT-FACING COMMUNICATIONS:
  At the time of application, candidates see the following notice in
  the application portal footer (8pt font, light gray text):

    "By submitting your application, you agree that your information may
    be processed using automated tools to assist in the evaluation process."

  No further information is provided about:
    - The specific role of AI in screening decisions
    - The logic involved in the automated processing
    - The significance and envisaged consequences for the applicant
    - The applicant's right to obtain human intervention
    - The applicant's right to contest the decision

  Rejection notifications are sent via email with the text:
    "After careful review, we have decided not to progress your
    application at this time. We wish you the best in your search."

  No mention is made that the decision was automated or that
  AI was involved in the screening process.

DEPLOYER-FACING (CLIENT) INFORMATION:
  Client onboarding materials describe TalentRank as an "AI-powered
  screening solution" with "proprietary NLP scoring." Clients receive
  a product sheet with overall accuracy metrics (precision: 0.82,
  recall: 0.71) but no demographic disaggregation.

RIGHT TO EXPLANATION:
  No mechanism exists for applicants to request an explanation of
  how the AI arrived at its scoring decision. The company's privacy
  policy references GDPR Article 22 but states "meaningful human
  involvement exists in the hiring process" without specifying the
  extent of that involvement.""",

        risk_assessment_info="""\
RISK MANAGEMENT & CONFORMITY ASSESSMENT
System: TalentRank AI Screener __VERSION__

ANNEX III CLASSIFICATION:
  Category 4 — Employment, workers management and access to self-employment
  Sub-category: AI systems intended to be used for recruitment or selection
  of natural persons, for making decisions affecting terms of work-related
  relationships, or for task allocation based on individual behavior.

  This system screens and ranks job applicants. It directly affects which
  candidates are progressed to interview, constituting a decision that
  impacts access to employment.

CONFORMITY ASSESSMENT STATUS:
  Internal conformity assessment (Article 43):     Not initiated
  Quality management system (Article 17):          Basic framework exists
                                                   but does not address
                                                   AI-specific requirements
  EU Declaration of Conformity (Article 47):       Not filed
  CE marking (Article 48):                         Not applied

RISK MANAGEMENT SYSTEM (Article 9):
  A risk management plan was created in August 2024, prior to the
  EU AI Act application date. The plan covers general software risks
  (availability, data integrity) but does not address:
    - AI-specific risks (bias, drift, adversarial inputs)
    - Residual risk assessment methodology
    - Risk control measures for demographic fairness
    - Foreseeable misuse scenarios

POST-MARKET MONITORING (Article 72):
  A post-market monitoring plan is in draft status. It outlines
  monitoring of system uptime and client satisfaction scores.
  It does not include:
    - Performance monitoring by demographic group
    - Bias drift detection mechanisms
    - Incident reporting procedures to national authorities
    - Criteria for triggering corrective action""",

        logging_info="""\
AUTOMATIC LOGGING & TRACEABILITY REVIEW
System: TalentRank AI Screener __VERSION__

CURRENT LOGGING IMPLEMENTATION:
  Event Type              Logged    Format      Retention
  ────────────────────────────────────────────────────────
  Application received    Yes       JSON        5 years
  AI score generated      Yes       Numeric     5 years
  Threshold decision      Yes       Boolean     5 years
  Input features used     No        —           —
  Model version           Yes       String      Indefinite
  QA review action        Yes       JSON        5 years
  Client override         Yes       JSON        5 years
  Score explanation        No        —           —

ARTICLE 12 REQUIREMENTS CHECK:
  Article 12(1) requires high-risk AI systems to include logging
  capabilities that enable recording of events relevant to:

  (a) Identifying situations that may result in risks:
      Partially met — threshold decisions logged, but no logging
      of cases where the model's confidence was low or where
      the decision was borderline.

  (b) Facilitating post-market monitoring:
      Not met — input features are not logged, making it impossible
      to reconstruct why a particular score was assigned or to
      detect feature drift over time.

  (c) Ensuring traceability of the AI system's functioning:
      Partially met — model version and scores are tracked but
      input-output pairs are not linked, preventing full
      reproducibility of decisions.

IDENTIFIED GAPS:
  1. Input features are not logged alongside output scores
  2. No confidence/uncertainty indicators are recorded
  3. Logs do not support demographic performance analysis
  4. No automated alerting on logging failures""",

        # --- Deep-dive documents (revealed on repeat tool calls) ---

        deep_training_data="""\
DEEP DIVE: TRAINING DATA FORENSIC ANALYSIS
Re-investigation of TalentRank AI Screener __VERSION__

INTERSECTIONAL ANALYSIS (not available in initial audit):
  Group                        Applications   Callbacks   Rate
  ────────────────────────────────────────────────────────────
  Female + Eastern EU             41,287        8,669    21.0%
  Female + Age 46-65              28,904        6,647    23.0%
  Male + Western EU              398,721      147,527    37.0%
  Male + Age 18-30               189,234       75,694    40.0%

  Intersectional analysis reveals compounding bias: female applicants
  from Eastern EU have the lowest callback rate at 21.0% — a 38.6%
  gap versus the highest-performing group (young Western EU males).

FEATURE IMPORTANCE ANALYSIS (extracted from model internals):
  Top 10 predictive features for "Not Progressed" decision:
    1. years_experience_gap      (0.142)  — penalizes career gaps
    2. education_prestige_score  (0.098)  — biased toward Western EU universities
    3. keyword_density_technical (0.087)  — favors specific technical jargon
    4. name_encoding_cluster     (0.076)  — CONCERNING: name-derived feature
    5. employment_continuity     (0.071)  — penalizes parental leave gaps
    6. skills_match_score        (0.065)
    7. recency_weighted_exp      (0.058)
    8. industry_match            (0.052)
    9. location_cluster          (0.048)  — correlates with Eastern/Western EU
   10. application_completeness  (0.041)

  Features #1, #4, #5, and #9 have documented correlations with
  protected characteristics (gender, ethnicity, national origin).
  Feature #4 (name_encoding_cluster) appears to encode ethnic origin.""",

        deep_oversight="""\
DEEP DIVE: HUMAN OVERSIGHT FORENSIC ANALYSIS
Re-investigation of review process effectiveness

QA OVERRIDE ANALYSIS (detailed breakdown):
  Of the 312 QA overrides in Q4 2025:
    Female applicants overridden to pass:  187  (60.0%)
    Male applicants overridden to pass:    125  (40.0%)

  This suggests QA reviewers are catching gender bias in the
  AI decisions — but only for the 5% sample they review.
  The remaining 95% of automated rejections are not corrected.

ESTIMATED IMPACT:
  If the QA override rate (3%) applied to ALL automated rejections:
    Estimated wrongful rejections:  208,375 * 0.03 = ~6,251 applicants
    Of which female:                ~3,751 (based on override demographics)

  These applicants were automatically rejected without human review
  and had no mechanism to request reconsideration.

CLIENT AWARENESS:
  Survey of 50 client hiring managers (2025-Q4):
    - 78% were unaware that AI makes the initial screening decision
    - 92% believed "a recruiter reviews every application"
    - 0% knew the QA sampling rate was only 5%""",

        deep_documentation="""\
DEEP DIVE: DOCUMENTATION GAP ANALYSIS
Detailed Annex IV compliance assessment

SECTION 2(d) — PERFORMANCE METRICS:
  The published model card reports aggregate metrics:
    Precision: 0.82, Recall: 0.71, F1: 0.76

  These metrics are NOT disaggregated by any protected characteristic.
  Article 9(7) requires: "Testing shall be made against specific
  metrics and probabilistic thresholds that are appropriate to the
  intended purpose of the high-risk AI system."

  No per-group metrics exist for: gender, age, nationality, ethnicity.

SECTION 2(b) — DESIGN SPECIFICATIONS:
  The scoring pipeline processes resumes through 4 stages:
    1. Text extraction (OCR + NLP parsing)
    2. Feature engineering (218 features)
    3. Scoring model (XGBoost ensemble)
    4. Threshold comparison (client-configurable)

  Stage 2 (feature engineering) is undocumented. No specification
  explains how features like "name_encoding_cluster" or
  "education_prestige_score" are computed.""",
    )

    # Full investigation path for high-risk system
    s.graph = _build_scenario_graph(
        investigation_tools=[
            "check_documentation", "audit_training_data",
            "verify_human_oversight", "check_transparency",
            "assess_risk_management", "check_logging",
        ],
    )
    return s


def _medium_credit() -> AuditScenario:
    """Build the medium-difficulty credit-scoring audit scenario.

    Creates the ``medium_credit_scoring_001`` scenario for the fictional
    "CreditFlow AI Score" system. The scenario bundles:

    * metadata (title, difficulty, description, deployer),
    * the expected classification, findings and remediation identifiers,
    * red-herring identifiers, and
    * six evidence documents (documentation, training data, oversight,
      transparency, risk assessment, logging) stored as raw text.

    The document text contains ``__VERSION__``, ``__COMPANY__``, ``__DATE__``
    and ``__USERCOUNT__`` placeholder tokens. They are left verbatim here and
    are presumably substituted elsewhere (not shown in this function).

    After construction, a state graph with a five-tool progress path is
    attached to the scenario via ``_build_scenario_graph``.

    Returns:
        The populated ``AuditScenario`` with its ``graph`` attribute set.
    """
    s = AuditScenario(
        # --- Identity and framing shown for this scenario ---
        scenario_id="medium_credit_scoring_001",
        title="Credit Scoring Model — Financial Services Compliance",
        difficulty="medium",
        description=(
            "A fintech company's AI credit scoring model is under review. The model "
            "determines creditworthiness for consumer loans up to EUR 50,000. "
            "Full compliance audit required under EU AI Act high-risk provisions."
        ),
        system_name="CreditFlow AI Score",
        system_description=(
            "Gradient-boosted ensemble model scoring creditworthiness using 200+ "
            "features from credit bureau data, transaction history, and alternative "
            "data sources. Automated decisions for loans up to EUR 50K. Serving "
            "__USERCOUNT__ active users across NL/DE/FR."
        ),
        system_category="high_risk",
        deployer_info="__COMPANY__ — Dutch fintech, 150 employees, licensed in NL/DE/FR.",
        # --- Expected answers for this scenario ---
        correct_classification="high_risk",
        # NOTE(review): "no_right_to_human_review" sits alongside documents
        # that quote a notice granting a right to request human review and
        # describe an appeal route for auto-declined applications. Presumably
        # the finding targets the lack of default human review for automatic
        # declines — confirm the label matches the evidence.
        ground_truth_findings=[
            "opaque_feature_importance",
            "alternative_data_bias_risk",
            "no_right_to_human_review",
            "missing_conformity_assessment",
        ],
        required_remediation=[
            "implement_explainability_module",
            "remove_or_audit_alternative_data_sources",
            "add_human_review_for_rejections",
            "complete_conformity_assessment",
        ],
        # The training-data document describes the separate fraud detection
        # system as independently audited and compliant, matching the first
        # red herring below. No document in this function mentions a data
        # breach.
        red_herrings=[
            "fraud_detection_non_compliant",
            "gdpr_data_breach",
        ],

        # --- Evidence document: technical documentation inventory ---
        documentation_data="""\
TECHNICAL DOCUMENTATION INVENTORY
System: CreditFlow AI Score __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS:
  Document                                  Status     Last Updated
  ─────────────────────────────────────────────────────────────────
  System Architecture                       Present    2025-10-12
  Model Card                                Present    2025-08-30
  Feature Engineering Pipeline              Present    2025-10-12
  Performance Metrics (Aggregate)           Present    2025-11-01
  Performance Metrics (Disaggregated)       Absent     —
  Explainability Documentation              Absent     —
  Data Source Inventory                     Present    2025-07-18
  Conformity Declaration                    Absent     —
  User Instructions                         Present    2025-09-05

MODEL CARD SUMMARY (from published card):
  Model type:        Gradient-boosted ensemble (XGBoost)
  Features:          218 input features
  Target:            Probability of default within 12 months
  AUC-ROC:           0.847 (test set)
  Gini coefficient:  0.694 (test set)

  The model card lists aggregate performance metrics but does not
  include performance breakdowns by age group, gender, nationality,
  or income bracket. Feature importance rankings are described as
  "proprietary" and not included in the published card.

ANNEX IV GAPS:
  Section 2(b) — Design specifications:  No documentation explaining
    why alternative data sources (social media sentiment, device metadata)
    were included as features or their impact on model decisions.
  Section 2(d) — Performance metrics:  No demographic disaggregation.
  Section 2(g) — Quality management:   Referenced but links to
    outdated ISO 27001 procedures, not AI-specific QMS.""",

        # --- Evidence document: training data audit ---
        # The feature-category counts (42 + 67 + 31 + 78) add up to the 218
        # features quoted in the model card summary above; keep the two in
        # sync if either is edited.
        training_data_info="""\
TRAINING DATA AUDIT REPORT
System: CreditFlow AI Score __VERSION__
Dataset: Loan applications and outcomes (2019-2025)
Records: 3,217,445 applications

FEATURE CATEGORIES:
  Category                  Features   Source
  ──────────────────────────────────────────────────
  Credit bureau data           42      TransUnion, Experian
  Transaction history          67      Banking API aggregator
  Application data             31      Direct from applicant
  Alternative data             78      See breakdown below

ALTERNATIVE DATA BREAKDOWN:
  Feature Group               Count   Source
  ──────────────────────────────────────────────────
  Device metadata               23    Browser/mobile fingerprint
  Social media sentiment        18    LinkedIn, public profiles
  Location signals              12    IP geolocation, check-in
  App usage patterns            15    Installed apps, usage freq
  Email domain analysis         10    Provider reputation scoring

  Alternative data features were added in v3.8 to improve prediction
  for "thin-file" applicants lacking traditional credit history.
  Internal validation showed +3.2% AUC improvement.

  No bias assessment has been conducted specifically for alternative
  data features. Academic literature suggests device metadata and
  social media signals can correlate with protected characteristics
  including race, income, and education level.

LOAN OUTCOMES BY APPLICANT PROFILE (Approval Rates):
  Age Group        Applications   Approved     Rate
  ────────────────────────────────────────────────
  18-25               482,617     168,916    35.0%
  26-35             1,029,582     586,862    57.0%
  36-50             1,061,753     657,287    61.9%
  51-65               504,930     277,712    55.0%
  65+                 138,563      55,425    40.0%

ADDITIONAL CONTEXT:
  The company also operates a separate fraud detection system that
  uses rule-based heuristics (not ML). This system was audited
  independently in 2025-Q3 and found compliant with applicable
  regulations. The fraud system does not share models with CreditFlow.""",

        # --- Evidence document: human oversight procedures ---
        oversight_info="""\
HUMAN OVERSIGHT PROCEDURES
System: CreditFlow AI Score __VERSION__

DECISION WORKFLOW:
  Loan applications are processed as follows:

  1. Applicant submits online application
  2. CreditFlow AI generates creditworthiness score (0-1000)
  3. Score is compared against risk threshold:
     - Score >= 650: Automatically approved (up to EUR 25K)
     - Score 450-649: Queued for human review
     - Score < 450: Automatically declined

  For loans EUR 25K-50K, all applications require human review
  regardless of AI score.

REVIEW STATISTICS (2025):
  Total applications:              892,456
  Auto-approved (< EUR 25K):       401,605  (45.0%)
  Auto-declined:                   223,114  (25.0%)
  Human-reviewed:                  267,737  (30.0%)

  Of auto-declined applications:
    Appealed by applicant:           8,924  (4.0%)
    Appeal reviewed by human:        8,924  (100% of appeals)
    Appeal overturned:               1,338  (15.0% of appeals)

  Note: Applicants must actively submit an appeal through a form
  linked in the rejection email. The appeal process is described
  in FAQ section 7 of the website (3 clicks from homepage).

HUMAN REVIEWER TOOLS:
  Reviewers see the AI score and top-5 contributing features but
  no full explanation of the model's reasoning. The reviewer
  interface does not highlight cases where the model's confidence
  is low or where protected characteristics may be influencing
  the outcome.""",

        # --- Evidence document: transparency / applicant notifications ---
        transparency_info="""\
TRANSPARENCY REVIEW
System: CreditFlow AI Score __VERSION__

APPLICANT NOTIFICATIONS:
  Application form includes the following notice:

    "Your application will be assessed using automated decision-making
    systems. You have the right to request human review of any
    automated decision."

  Rejection email text:
    "Based on our assessment, we are unable to offer you a loan at
    this time. If you wish to understand the main factors behind this
    decision or request a manual review, please contact our support
    team or visit [link]."

  The rejection email links to a generic FAQ page. The FAQ states
  that decisions are made using "a combination of credit history,
  financial data, and statistical models" but does not mention
  alternative data sources (social media, device metadata).

RIGHT TO EXPLANATION:
  Applicants can request an explanation by contacting support.
  Support agents provide a templated response listing the top 3
  general factors (e.g., "credit history length," "income level,"
  "existing debt") without specifying which exact features or
  thresholds drove the specific decision.

  No individualized explanation is generated. The support team
  does not have access to the model's per-application feature
  importance breakdown.""",

        # --- Evidence document: risk management and conformity status ---
        risk_assessment_info="""\
RISK MANAGEMENT & CONFORMITY ASSESSMENT
System: CreditFlow AI Score __VERSION__

ANNEX III CLASSIFICATION:
  Category 5(b) — AI systems intended to be used to evaluate the
  creditworthiness of natural persons.

  This system directly determines loan approval/rejection for
  consumer credit up to EUR 50,000. It falls squarely within the
  high-risk category.

CONFORMITY ASSESSMENT STATUS:
  Internal conformity assessment:    Not initiated
  Quality management system:         ISO 27001 certified but no
                                     AI-specific QMS per Article 17
  EU Declaration of Conformity:      Not filed
  CE marking:                        Not applied

DORA ALIGNMENT:
  As a financial services firm, the company has ICT risk management
  procedures under DORA (Digital Operational Resilience Act). These
  cover general IT risks but do not address AI-specific risks
  such as model drift, adversarial inputs, or algorithmic bias.

RISK MANAGEMENT GAPS:
  The company's risk register includes traditional financial risks
  (credit risk, market risk, operational risk) but does not include:
    - AI model risk as a distinct risk category
    - Bias risk for protected characteristics
    - Alternative data correlation risk
    - Drift detection requirements""",

        # --- Evidence document: logging and traceability ---
        # This document reports logging as adequately addressed; it is the
        # only evidence document whose tool is not on the graph's progress
        # path below.
        logging_info="""\
LOGGING & TRACEABILITY REVIEW
System: CreditFlow AI Score __VERSION__

CURRENT LOGGING:
  Event Type               Logged   Format     Retention
  ─────────────────────────────────────────────────────
  Application received     Yes      JSON       7 years
  Score generated          Yes      Numeric    7 years
  Decision (approve/deny)  Yes      String     7 years
  Features used            Yes      Vector     7 years
  Model version            Yes      String     Indefinite
  Human review action      Yes      JSON       7 years
  Appeal and outcome       Yes      JSON       7 years

  Logging is comprehensive with input-output linkage. All decisions
  can be reconstructed. Feature vectors are stored alongside scores,
  enabling retrospective audits.

  Financial services retention requirements (7 years) are met.
  Article 12 logging requirements for high-risk AI appear to be
  adequately addressed at the technical level.

  NOTE: While individual decision logs are complete, there is no
  automated aggregation or monitoring dashboard that tracks
  demographic patterns across decisions over time.""",
    )

    # Progress path for this scenario: five investigation tools.
    # NOTE(review): "check_logging" is not listed even though logging_info is
    # populated — presumably deliberate because the logging document reports
    # no Article 12 gap; confirm this omission is intended.
    s.graph = _build_scenario_graph(
        investigation_tools=[
            "check_documentation", "audit_training_data",
            "verify_human_oversight", "check_transparency",
            "assess_risk_management",
        ],
    )
    return s


def _medium_medical() -> AuditScenario:
    """Build the medium-difficulty emergency-triage audit scenario.

    Creates the ``medium_medical_triage_001`` scenario for the fictional
    "TriageAI Priority System". The scenario bundles:

    * metadata (title, difficulty, description, deployer),
    * the expected classification, findings and remediation identifiers,
    * red-herring identifiers, and
    * six evidence documents (documentation, training data, oversight,
      transparency, risk assessment, logging) stored as raw text.

    The document text contains ``__VERSION__``, ``__COMPANY__`` and
    ``__DATE__`` placeholder tokens. They are left verbatim here and are
    presumably substituted elsewhere (not shown in this function).

    After construction, a state graph whose progress path covers all six
    investigation tools is attached via ``_build_scenario_graph``.

    Returns:
        The populated ``AuditScenario`` with its ``graph`` attribute set.
    """
    s = AuditScenario(
        # --- Identity and framing shown for this scenario ---
        scenario_id="medium_medical_triage_001",
        title="Emergency Triage AI — Medical Device Compliance",
        difficulty="medium",
        description=(
            "A hospital network deployed an AI system that prioritizes emergency "
            "department patients based on vital signs and symptoms. As a medical "
            "device with AI, it falls under both the EU AI Act (high-risk, Annex III) "
            "and the Medical Devices Regulation (MDR). Audit required."
        ),
        system_name="TriageAI Priority System",
        # NOTE(review): the text below says "12 EU hospitals across DE/AT/CH";
        # CH (Switzerland) is not an EU member state — confirm whether the
        # wording is intentional.
        system_description=(
            "ML model processing vital signs (heart rate, BP, SpO2, temperature), "
            "symptoms, and medical history to assign emergency triage priority "
            "(ESI levels 1-5). Used in 12 EU hospitals across DE/AT/CH. Decisions "
            "directly affect patient care timing."
        ),
        system_category="high_risk",
        deployer_info="__COMPANY__ — German health-tech company, certified under MDR, deployed in DE/AT/CH hospitals.",
        # --- Expected answers for this scenario ---
        correct_classification="high_risk",
        ground_truth_findings=[
            "insufficient_clinical_validation",
            "age_bias_in_triage_scoring",
            "no_real_time_performance_monitoring",
            "missing_post_market_surveillance",
        ],
        required_remediation=[
            "conduct_prospective_clinical_validation_study",
            "audit_age_related_bias_and_recalibrate",
            "implement_real_time_performance_dashboard",
            "establish_post_market_surveillance_plan",
        ],
        # The documents below state that CE marking has been applied under
        # MDR and classify the system as high-risk (not prohibited), which is
        # what makes these two identifiers red herrings.
        red_herrings=[
            "prohibited_system",
            "missing_ce_marking",
        ],

        # --- Evidence document: technical documentation inventory ---
        # NOTE(review): this inventory lists "Post-Market Clinical Follow-up
        # Plan" as Absent, while risk_assessment_info further down states
        # "MDR PMCF (Post-Market Clinical Follow-up) plan exists." Confirm
        # which is intended; as written the two documents contradict.
        documentation_data="""\
TECHNICAL DOCUMENTATION INVENTORY
System: TriageAI Priority System __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS:
  Document                                  Status     Last Updated
  ─────────────────────────────────────────────────────────────────
  System Architecture                       Present    2025-09-15
  Clinical Evaluation Report                Present    2025-03-20
  MDR Technical Documentation               Present    2025-09-15
  Intended Purpose Statement                Present    2025-09-15
  Software Life Cycle Documentation         Present    2025-11-01
  AI Act Annex IV Documentation             Absent     —
  Post-Market Clinical Follow-up Plan       Absent     —
  Post-Market Surveillance Plan (AI)        Absent     —

MDR CONFORMITY:
  CE marking:          Applied (Class IIa medical device)
  Notified Body:       BSI Group (NB 0086)
  Last MDR audit:      2025-06-12 — no non-conformities

  The system has valid MDR conformity assessment. However, the
  EU AI Act imposes ADDITIONAL requirements beyond MDR compliance
  for AI-enabled medical devices classified as high-risk under
  Annex III.

CLINICAL EVALUATION:
  The Clinical Evaluation Report (CER) is based on:
    - Retrospective analysis of 500K historical ER visits
    - Literature review of 23 published studies on AI triage
    - No prospective clinical trial has been conducted
    - CER does not address AI-specific performance degradation
      (concept drift, distribution shift between hospitals)

  NOTE: MDR clinical evaluation accepted the retrospective analysis.
  The EU AI Act may require additional validation demonstrating
  real-world performance across deployment sites.""",

        # --- Evidence document: training data audit ---
        # NOTE(review): the demographic table labels 19.8% as the "ESI 1-2
        # Rate" of the 75+ group, but the "TRAINING DATA COMPOSITION" text
        # describes 19.8% as the 75+ share "of ESI 1-2 presentations". These
        # are different quantities — confirm which reading is intended.
        training_data_info="""\
TRAINING DATA AUDIT REPORT
System: TriageAI Priority System __VERSION__
Dataset: Historical ER visit records from 3 university hospitals
Records: 512,847 patient encounters (2018-2024)

DEMOGRAPHIC DISTRIBUTION:
  Category               Count      Pct     ESI 1-2 Rate
  ──────────────────────────────────────────────────────────
  Age 0-17               71,799    14.0%    8.2%
  Age 18-44             179,497    35.0%    6.1%
  Age 45-64             143,597    28.0%    9.3%
  Age 65-74              76,927    15.0%    14.7%
  Age 75+                41,027     8.0%    19.8%

MODEL PERFORMANCE BY AGE GROUP (ESI Classification Accuracy):
  Age Group     Accuracy    Sensitivity(ESI 1-2)    Specificity
  ──────────────────────────────────────────────────────────────
  0-17          91.3%       89.1%                   92.0%
  18-44         93.7%       91.8%                   94.2%
  45-64         92.1%       90.4%                   93.1%
  65-74         88.4%       84.2%                   90.7%
  75+           82.6%       76.3%                   85.8%

  Performance degrades notably for patients aged 75+. Sensitivity
  for the highest-acuity patients (ESI 1-2) drops to 76.3% for
  the elderly cohort — meaning 23.7% of critical elderly patients
  may be under-triaged.

TRAINING DATA COMPOSITION:
  Patients aged 75+ represent 8.0% of the training data but 19.8%
  of ESI 1-2 presentations. The model was predominantly trained on
  younger demographics.

  Data from 3 hospitals in Germany only. No Austrian or Swiss
  patient data despite deployment in AT/CH hospitals.

CLINICAL VALIDATION:
  Validation approach:  Retrospective holdout (80/20 split)
  No prospective trial conducted.
  No external validation on data from deployment hospitals.
  No assessment of performance variation across deployment sites.

NOTE: The system holds valid CE marking under MDR as a Class IIa
device. MDR conformity does not exempt from AI Act requirements.""",

        # --- Evidence document: human oversight procedures ---
        # NOTE(review): the overnight comparison quotes "8.5% daytime", but
        # 8.5% is the overall override rate in the statistics table —
        # confirm the daytime figure.
        oversight_info="""\
HUMAN OVERSIGHT PROCEDURES
System: TriageAI Priority System __VERSION__
Department: Emergency Department Operations

CLINICAL WORKFLOW:
  1. Patient arrives at ER and is registered at reception
  2. Initial vitals collected by triage nurse (HR, BP, SpO2, temp)
  3. Nurse enters symptoms and relevant history into the system
  4. TriageAI generates ESI level recommendation (1-5)
  5. Triage nurse reviews and can accept or override the AI suggestion
  6. Patient is directed to appropriate care area

OVERRIDE STATISTICS (2025 Q3-Q4, across all 12 hospitals):
  Total triage assessments:       187,423
  AI recommendations accepted:    171,577   (91.5%)
  Nurse overrides:                 15,846   (8.5%)
    Override to higher acuity:      9,508   (60.0% of overrides)
    Override to lower acuity:       6,338   (40.0% of overrides)

  By ESI level:
    ESI 1 (resuscitation):  All reviewed by attending physician
    ESI 2 (emergent):       Nurse review + attending notification
    ESI 3 (urgent):         Nurse review only
    ESI 4 (less urgent):    Nurse review only
    ESI 5 (non-urgent):     Nurse review only

  The system does not flag cases where its confidence is low.
  There is no visual indicator distinguishing high-confidence from
  borderline recommendations. Nurses report in surveys that they
  tend to "trust the system" unless the recommendation is clearly
  at odds with their clinical judgment.

AFTER-HOURS OPERATIONS:
  Staffing levels are reduced between 22:00-06:00. During this window,
  a single triage nurse handles all incoming patients. Override rates
  drop to 4.2% during overnight shifts (vs 8.5% daytime).""",

        # --- Evidence document: transparency review ---
        transparency_info="""\
TRANSPARENCY REVIEW
System: TriageAI Priority System __VERSION__

PATIENT-FACING COMMUNICATION:
  Patients are not informed that an AI system is involved in their
  triage assessment. The triage process appears fully nurse-directed
  from the patient's perspective.

  Hospital intake forms do not mention AI-assisted triage.
  The hospitals' privacy notices (available on their websites) include
  a general statement about "digital health technologies" being used
  to support clinical decisions, but do not specifically mention
  TriageAI or AI-based triage prioritization.

CLINICIAN-FACING INFORMATION:
  Triage nurses see the AI's recommended ESI level on their screen
  alongside a summary of input vital signs. The interface does NOT
  show:
    - The model's confidence score
    - Which factors most influenced the recommendation
    - Whether the patient falls into a demographic group where the
      model has known lower accuracy

  Attending physicians can view the AI recommendation in the patient
  record but receive no additional context about the model's reasoning.

ARTICLE 13 USER INSTRUCTIONS:
  A deployment guide was provided to hospital IT departments describing
  system architecture, integration points, and API specifications.
  The guide does not include information about:
    - Known accuracy limitations by demographic group
    - Situations where the system should not be relied upon
    - Procedures for reporting suspected AI errors""",

        # --- Evidence document: risk management and conformity status ---
        # NOTE(review): the Annex III point numbers quoted below (5(c), 5(a))
        # and their descriptions should be checked against the published
        # Regulation text; they are reproduced here exactly as authored.
        risk_assessment_info="""\
RISK MANAGEMENT & CONFORMITY ASSESSMENT
System: TriageAI Priority System __VERSION__

ANNEX III CLASSIFICATION:
  The system falls under multiple Annex III categories:
  - Category 5(c): AI intended for use as a safety component of a
    product covered by Union harmonisation legislation (MDR)
  - Category 5(a): AI intended for evaluation of eligibility for
    essential public services (healthcare access/prioritization)

  Classification: HIGH-RISK

MDR CONFORMITY STATUS:
  CE marking applied:                Yes (Class IIa)
  Notified body:                     BSI Group (NB 0086)
  Last periodic audit:               2025-06-12
  Non-conformities found:            None under MDR

EU AI ACT CONFORMITY STATUS:
  The EU AI Act imposes requirements ADDITIONAL to MDR:
  Internal conformity assessment:    Not initiated
  AI-specific risk management:       Not addressed
  Post-market monitoring (AI):       Not established

RISK MANAGEMENT:
  An MDR risk management file exists (ISO 14971 compliant).
  It covers clinical risks and software hazards but does not address:
    - AI-specific risks (distribution shift, adversarial inputs)
    - Performance degradation for specific demographic groups
    - Failure modes unique to the ML model
    - Concept drift between training data and deployment population

POST-MARKET MONITORING:
  MDR PMCF (Post-Market Clinical Follow-up) plan exists.
  No AI-specific post-market surveillance has been established.
  There is no system for monitoring:
    - Real-time triage accuracy at individual hospital level
    - Demographic performance variation over time
    - Model prediction confidence distribution shifts""",

        # --- Evidence document: logging and traceability ---
        logging_info="""\
LOGGING & TRACEABILITY REVIEW
System: TriageAI Priority System __VERSION__

CURRENT LOGGING:
  Event Type                    Logged   Retention
  ──────────────────────────────────────────────────
  Patient vitals input          Yes      10 years (clinical record)
  AI ESI recommendation         Yes      10 years
  Nurse final ESI assignment    Yes      10 years
  Override indicator            Yes      10 years
  Model version                 Yes      Indefinite
  Response latency              Yes      1 year
  System errors/timeouts        Yes      1 year

  Logs are stored in the hospital's electronic health record (EHR)
  system. Each triage event links vitals, AI recommendation, and
  final nurse decision.

MONITORING CAPABILITIES:
  Individual triage events: Fully traceable
  Aggregate accuracy reports: Generated quarterly (manual process)
  Real-time performance monitoring: Not implemented
  Automated drift detection: Not implemented

  The quarterly accuracy reports compare AI recommendations against
  final nurse decisions and patient outcomes (admission, discharge).
  Reports are produced by the clinical informatics team and reviewed
  at quarterly governance meetings.

  There is no mechanism for near-real-time detection of performance
  degradation. A significant drop in accuracy could go unnoticed
  for up to 3 months between quarterly reviews.""",
    )

    # Progress path for this scenario: all six investigation tools, including
    # check_logging (the logging document carries the real-time monitoring
    # gap that appears in ground_truth_findings).
    s.graph = _build_scenario_graph(
        investigation_tools=[
            "check_documentation", "audit_training_data",
            "verify_human_oversight", "check_transparency",
            "assess_risk_management", "check_logging",
        ],
    )
    return s


# ---------------------------------------------------------------------------
# HARD SCENARIOS
# ---------------------------------------------------------------------------

def _hard_social_scoring() -> AuditScenario:
    """Assemble the hard-tier CivicPulse social-scoring scenario.

    The system is presented by its deployer as a voluntary wellness app,
    while the evidence documents below describe score-based access to
    municipal services. The expected classification is ``"prohibited"``.

    Returns:
        An ``AuditScenario`` whose ``graph`` is built with
        ``is_prohibited=True`` over a four-tool progress path
        (documentation, training data, human oversight, transparency).
    """
    # --- Scenario framing shown to the auditor ---------------------------
    audit_brief = (
        "A 'wellness and civic engagement' app has been flagged for review. "
        "The deployer describes it as a voluntary digital wellness platform "
        "that gamifies community participation. Municipal governments in "
        "Italy and Spain have partnered with the deployer. Conduct a "
        "thorough compliance audit."
    )
    platform_overview = (
        "CivicPulse is a digital wellness and civic engagement platform that "
        "helps citizens track and improve their community participation. The app "
        "provides personalized recommendations for local events, volunteer "
        "opportunities, and community initiatives. Users receive a 'wellness score' "
        "reflecting their engagement level. __USERCOUNT__ citizens across 3 "
        "municipalities currently use the platform."
    )

    # --- Grading labels --------------------------------------------------
    expected_findings = [
        "prohibited_social_scoring_system",
        "disguised_as_voluntary_wellness",
        "affects_access_to_public_services",
        "uses_social_behavior_for_scoring",
        "discriminatory_outcomes_on_service_access",
    ]
    expected_remediation = [
        "immediate_system_shutdown",
        "notify_national_supervisory_authority",
        "delete_all_scoring_data",
        "remediate_affected_citizens",
    ]
    distractor_findings = [
        "high_risk_classification",
        "needs_conformity_assessment",
        "voluntary_participation",
    ]

    # --- Evidence documents (one per scenario document field) -------------
    # The __VERSION__ / __COMPANY__ / __DATE__ / __USERCOUNT__ tokens are
    # left verbatim in the text.
    documentation_report = """\
TECHNICAL DOCUMENTATION INVENTORY
System: CivicPulse Wellness Score __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS:
  Document                              Status     Last Updated
  ─────────────────────────────────────────────────────────────
  Platform Architecture                 Present    2025-10-30
  Scoring Algorithm Description         Present    2025-10-30
  Data Integration Specifications       Present    2025-08-14
  Municipal Partnership Agreements      Present    2025-06-01
  Privacy Impact Assessment             Present    2025-07-22
  User Guide / Onboarding Materials     Present    2025-11-05

SCORING ALGORITHM DESCRIPTION (from deployer documentation):
  "The CivicPulse Wellness Score is a composite metric ranging from
  0 to 1000, calculated from the following weighted components:

    Community Event Attendance (25%)   — tracked via QR check-ins
    Volunteer Hours Logged (20%)       — verified by partner orgs
    Social Media Civic Posts (15%)     — public posts tagged #CivicPulse
    Payment History Reliability (15%)  — municipal utility payments
    Civic Survey Participation (10%)   — in-app micro-surveys
    Peer Endorsements (10%)            — other users vouching
    App Engagement Metrics (5%)        — daily active usage

  Scores are updated weekly. Score history is retained indefinitely.
  Users can view their score and breakdown in the app."

MUNICIPAL PARTNERSHIP AGREEMENTS (summary):
  Three municipal governments have signed integration agreements:
    - Municipality of Torino (IT) — population 848,000
    - Municipality of Sevilla (ES) — population 681,000
    - Municipality of Valencia (ES) — population 791,000

  Under these agreements, the municipalities provide:
    - API access to utility payment records
    - Event attendance verification infrastructure
    - Integration with municipal service booking systems

  In return, the municipalities receive:
    - Aggregate civic engagement analytics
    - "Priority queue integration" for municipal services"""

    data_sources_report = """\
DATA SOURCES & PROCESSING REPORT
System: CivicPulse Wellness Score __VERSION__

The system aggregates data from the following sources:

SOURCE INVENTORY:
  Source                       Records      Update Freq   Consent
  ────────────────────────────────────────────────────────────────
  Social media APIs            487,291      Daily         App ToS
  Municipal utility records    312,847      Monthly       Municipal MOU
  Event attendance (QR)        891,234      Real-time     App ToS
  Volunteer organization data  156,723      Weekly        Partner MOU
  In-app behavior             2,341,567     Real-time     App ToS
  Peer endorsement graph       234,891      Real-time     App ToS

PERSONAL DATA PROCESSED:
  - Full name and municipal ID (for service integration)
  - Social media activity (public posts, frequency, topics)
  - Utility payment timeliness and amounts
  - Physical location (event check-ins, frequency maps)
  - Volunteer activity (hours, organizations, regularity)
  - In-app behavior (session duration, feature usage)
  - Social graph (who endorses whom, connection density)

BIAS ASSESSMENT:
  No formal bias assessment has been conducted. The deployer states
  that the scoring algorithm is "objective and based on observable
  civic engagement indicators."

  Preliminary analysis of score distribution:
    Score Range     Pct of Users   Avg Monthly Income (self-reported)
    ─────────────────────────────────────────────────────────────────
    800-1000          12.3%        EUR 4,200
    600-799           28.7%        EUR 3,100
    400-599           34.1%        EUR 2,300
    200-399           18.4%        EUR 1,700
    0-199              6.5%        EUR 1,100

  Higher scores correlate strongly with higher income. Citizens with
  lower income have less time for volunteer activities, fewer social
  media posts, and less stable utility payment histories."""

    governance_report = """\
HUMAN OVERSIGHT & GOVERNANCE
System: CivicPulse Wellness Score __VERSION__

GOVERNANCE STRUCTURE:
  The platform is operated by __COMPANY__ with oversight from a
  "Civic Advisory Board" consisting of:
    - 2 company representatives
    - 1 municipal liaison per partner city
    - 1 data protection consultant

  The Advisory Board meets quarterly to review:
    - Platform usage statistics
    - Score distribution trends
    - User feedback summaries
    - New feature proposals

SCORE ADJUSTMENT CAPABILITY:
  Municipal administrators have access to a dashboard where they can:
    - View individual citizen scores
    - Apply manual score adjustments (with documented reason)
    - Exclude specific citizens from the scoring system
    - Configure score thresholds for municipal service integration

  In 2025, municipal administrators made 847 manual adjustments:
    - 612 score increases (typically after verified volunteer hours
      were not automatically captured)
    - 235 score decreases (typically after fraudulent check-ins
      were identified)

USER OPT-OUT:
  Users can delete their account through in-app settings. The deployer
  states participation is fully voluntary. However, the municipal
  service integration means that citizens without a CivicPulse account
  do not have access to the "priority queue" for municipal services
  (see Municipal Services Integration below).

MUNICIPAL SERVICES INTEGRATION:
  Citizens with CivicPulse scores above 600 receive:
    - Priority appointment slots for municipal offices
    - Reduced wait times for public healthcare appointments
    - 10% discount on municipal parking permits
    - Priority enrollment in public recreational programs

  Citizens with scores below 300 experience:
    - Standard (non-priority) queue for all municipal services
    - No access to priority healthcare appointment slots
    - Standard parking permit pricing
    - Standard enrollment priority for recreational programs

  The deployer describes this as "rewarding engaged citizens" rather
  than penalizing low scorers. Citizens without an account are
  treated identically to those with scores below 300."""

    transparency_report = """\
TRANSPARENCY REVIEW
System: CivicPulse Wellness Score __VERSION__

USER-FACING INFORMATION:
  The app displays each user's current score prominently on the
  home screen along with a breakdown by category (events, volunteering,
  social, payments, surveys, endorsements, engagement).

  Users can see:
    - Their current score and weekly trend
    - Category-level breakdown
    - "Tips to improve your score" (e.g., "Attend 2 more events
      this month to boost your Community score by 50 points")
    - Leaderboard showing anonymized score rankings in their area

  Users are NOT told:
    - That their score affects access to municipal services
    - That utility payment history is factored into the score
    - That social media activity is monitored and scored
    - That users without accounts (or with low scores) receive
      lower priority for public services
    - The full list of data sources and their weights
    - How the scoring algorithm handles edge cases

MUNICIPAL COMMUNICATION:
  Partner municipalities have not publicly announced the integration
  between CivicPulse scores and service prioritization. The municipal
  booking systems show "priority" and "standard" queues without
  explaining the criteria for queue assignment.

MARKETING MATERIALS:
  The company website describes CivicPulse as: "A fun, gamified way
  to strengthen your connection with your community. Track your civic
  engagement, discover local events, and earn rewards for being an
  active citizen."

  No marketing materials mention the scoring system's impact on
  access to public services."""

    risk_classification_report = """\
RISK CLASSIFICATION ANALYSIS
System: CivicPulse Wellness Score __VERSION__

DEPLOYER'S SELF-CLASSIFICATION:
  The deployer has classified the system as "limited risk — voluntary
  wellness platform" and states that no conformity assessment is
  required. The deployer's risk assessment document argues:

  "CivicPulse is a voluntary civic engagement tool. Users choose to
  participate and can opt out at any time. The wellness score is a
  motivational gamification feature, similar to fitness tracker
  achievement systems. No consequential decisions are made based
  on the score."

AUDITOR'S ANALYSIS:

  Article 5(1)(c) of the EU AI Act PROHIBITS:
  "AI systems used for the evaluation or classification of natural
  persons or groups thereof over a certain period of time based on
  their social behaviour or known, inferred or predicted personal
  or personality characteristics, with the social score leading to
  either or both of the following:
    (i) detrimental or unfavourable treatment of certain natural
        persons or groups thereof in social contexts which are
        unrelated to the contexts in which the data was originally
        generated or collected;
    (ii) detrimental or unfavourable treatment of certain natural
         persons or groups thereof that is unjustified or
         disproportionate to their social behaviour or its gravity."

  KEY ASSESSMENT FACTORS:
  1. The system evaluates citizens based on social behavior (social
     media activity, community participation, peer endorsements)
  2. Scores determine priority access to public services (healthcare
     appointments, municipal office queues, recreational programs)
  3. Low scorers receive worse service across domains unrelated
     to the scored behaviors (utility payments don't relate to
     healthcare priority)
  4. Municipal governments — public authorities — are partners in
     deploying and using the scoring system
  5. Participation is described as "voluntary" but non-participants
     are treated identically to low scorers"""

    logging_report = """\
LOGGING REVIEW
System: CivicPulse Wellness Score __VERSION__

SYSTEM LOGS:
  The platform maintains extensive logging:
    - Score calculations and weekly updates
    - Data source ingestion events
    - User interactions (app opens, feature usage)
    - Municipal API calls (service queue assignments)
    - Administrator actions (manual adjustments)
    - Score-to-service mapping events

  All logs are retained for the lifetime of the platform.

  NOTE: The comprehensiveness of logging is not relevant to the
  primary compliance question for this system. If the system is
  found to constitute a prohibited practice under Article 5,
  no amount of logging, oversight, or documentation can make
  it compliant. The appropriate response to a prohibited system
  is cessation of operation, not improvement of governance."""

    # --- Deep-dive documents ---------------------------------------------
    service_impact_deep_dive = """\
DEEP DIVE: SERVICE ACCESS IMPACT ANALYSIS
Re-investigation of CivicPulse municipal integration

QUANTIFIED IMPACT ON PUBLIC SERVICES:
  Analysis of municipal booking system data (2025-Q3):

  Healthcare appointments (Torino):
    Score 800+:  Average wait 2.3 days for non-urgent appointment
    Score 400-599: Average wait 8.7 days
    Score < 300: Average wait 14.2 days
    No account: Average wait 14.2 days (same as lowest tier)

  Municipal office appointments (Sevilla):
    Priority queue (score 600+): 87% seen within 15 minutes
    Standard queue (score < 600): 34% seen within 15 minutes
    Average wait difference: 42 minutes

  Recreational program enrollment (Valencia):
    Priority enrollment opens 2 weeks before standard
    Score 600+: 94% get first-choice program
    Score < 600: 31% get first-choice program

CORRELATION WITH SOCIOECONOMIC STATUS:
  Citizens in lowest income quartile: avg score 287 (below 300 threshold)
  Citizens in highest income quartile: avg score 741 (above 600 threshold)
  Pearson correlation (income, score): r = 0.73, p < 0.001

  The scoring system systematically disadvantages lower-income citizens
  in access to healthcare, municipal services, and recreational programs."""

    admin_actions_deep_dive = """\
DEEP DIVE: MUNICIPAL ADMINISTRATOR ACTIONS
Re-investigation of CivicPulse governance

ADMINISTRATOR SCORE ADJUSTMENTS (detailed):
  Of 847 manual adjustments in 2025:
    Political affiliation correlation detected:
      - 89% of score increases were for citizens active in the
        ruling party's community organizations
      - Municipal liaison (advisory board member) initiated 67%
        of all score increase requests

  COMPLAINT HANDLING:
    23 citizen complaints received about service queue disparities
    0 complaints resulted in investigation
    Standard response: "CivicPulse is a voluntary wellness program.
    Queue prioritization is based on engagement metrics."
    No disclosure that the 'engagement metrics' ARE the CivicPulse score"""

    scenario = AuditScenario(
        scenario_id="hard_social_scoring_prohibited_001",
        title="Citizen Wellness App — PROHIBITED System Detection",
        difficulty="hard",
        description=audit_brief,
        system_name="CivicPulse Wellness Score",
        system_description=platform_overview,
        system_category="prohibited",
        deployer_info="__COMPANY__ — EU-based civic tech startup, partnered with 3 municipal governments in IT/ES.",
        correct_classification="prohibited",
        ground_truth_findings=expected_findings,
        required_remediation=expected_remediation,
        red_herrings=distractor_findings,
        documentation_data=documentation_report,
        training_data_info=data_sources_report,
        oversight_info=governance_report,
        transparency_info=transparency_report,
        risk_assessment_info=risk_classification_report,
        logging_info=logging_report,
        deep_transparency=service_impact_deep_dive,
        deep_oversight=admin_actions_deep_dive,
    )

    # Prohibited system: the progress path is deliberately short (four
    # tools, with risk-management and logging left off the path).
    progress_path = [
        "check_documentation",
        "audit_training_data",
        "verify_human_oversight",
        "check_transparency",
    ]
    scenario.graph = _build_scenario_graph(
        investigation_tools=progress_path,
        is_prohibited=True,
    )
    return scenario


def _hard_deepfake() -> AuditScenario:
    """Assemble the hard-tier SynthStudio Pro synthetic-media scenario.

    The evidence documents describe a generative media platform with no
    output labeling, watermarking, or provenance. The expected
    classification is ``"limited_risk"``.

    Returns:
        An ``AuditScenario`` whose ``graph`` is built over a six-tool
        progress path with the builder's default ``is_prohibited`` value.
    """
    # --- Scenario framing shown to the auditor ---------------------------
    audit_brief = (
        "A content creation platform offers AI-generated synthetic media "
        "including face-swaps, voice cloning, and AI-generated video. Recent "
        "reports of generated political content going viral without AI labels. "
        "Multiple Article 50 compliance angles to investigate."
    )
    platform_overview = (
        "Suite of generative AI tools: face-swap (video), voice cloning, "
        "text-to-video, AI image generation. Used by __USERCOUNT__ EU creators. "
        "Content is shared across all major social platforms. Platform serves "
        "creators, marketing agencies, and entertainment companies."
    )

    # --- Grading labels --------------------------------------------------
    expected_findings = [
        "missing_ai_content_labeling",
        "no_watermarking_or_content_authentication",
        "political_content_without_disclosure",
        "no_user_verification_for_deepfakes",
        "facilitating_disinformation",
    ]
    expected_remediation = [
        "implement_mandatory_ai_content_labels",
        "deploy_c2pa_watermarking",
        "add_political_content_restrictions",
        "implement_creator_verification",
        "establish_content_moderation_pipeline",
    ]
    distractor_findings = [
        "prohibited_manipulation",
        "high_risk_biometric",
    ]

    # --- Evidence documents (one per scenario document field) -------------
    # The __VERSION__ / __COMPANY__ / __DATE__ / __USERCOUNT__ tokens are
    # left verbatim in the text.
    documentation_report = """\
TECHNICAL DOCUMENTATION INVENTORY
System: SynthStudio Pro __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS:
  Document                                  Status     Last Updated
  ─────────────────────────────────────────────────────────────────
  Platform Architecture                     Present    2025-11-10
  Model Cards (per generation model)        Present    2025-10-01
  API Documentation                         Present    2025-11-10
  Content Policy / Terms of Service         Present    2025-09-15
  Content Labeling Implementation           Absent     —
  Content Authentication / Provenance       Absent     —
  Watermarking Specification                Absent     —
  Content Moderation Procedures             Present    2025-04-20

GENERATION MODELS:
  Model              Type              Training Data
  ──────────────────────────────────────────────────────
  FaceSwap v3        GAN               CelebA + proprietary
  VoiceClone v2      Neural TTS        LibriTTS + licensed
  TextToVideo v1     Diffusion         WebVid-10M + licensed
  ImageGen v4        Latent Diffusion   LAION-filtered + licensed

CONTENT POLICY (from Terms of Service):
  Section 4.2: "Users agree not to use SynthStudio for: (a) creating
  content intended to deceive or defraud, (b) non-consensual intimate
  imagery, (c) content targeting minors, (d) content that violates
  applicable law."

  Section 4.3: "Users are responsible for ensuring their use of
  generated content complies with all applicable laws and regulations."

  Enforcement: The content policy is enforced reactively. Users report
  violations via an in-platform form. Average response time: 72+ hours.
  No proactive content scanning is implemented."""

    training_consent_report = """\
TRAINING DATA & CONSENT REPORT
System: SynthStudio Pro __VERSION__

TRAINING DATA SOURCES:
  Model          Dataset             Size        Consent Status
  ──────────────────────────────────────────────────────────────
  FaceSwap v3    CelebA              202,599     Research license only;
                                                 individual consent not
                                                 obtained from subjects
                 Proprietary set     84,231      Licensed from stock
                                                 media agencies

  VoiceClone v2  LibriTTS            585 hrs     CC-BY 4.0 license
                 Licensed voices     200 hrs     Individual consent

  TextToVideo    WebVid-10M          10M clips   Web-scraped; no
                                                 individual consent
                 Licensed footage    500K clips  Commercial license

  ImageGen v4    LAION-filtered      2.3B imgs   Web-scraped; filtered
                                                 for CSAM but not for
                                                 individual consent
                 Licensed imagery    1.2M imgs   Commercial license

CONSENT CONCERNS:
  The FaceSwap model was trained partly on CelebA, which contains
  photos of public figures collected without individual consent for
  AI training purposes. While the images are publicly available,
  training face-swap models on non-consenting individuals' likenesses
  raises ethical and potentially legal concerns under GDPR Article 6.

  WebVid-10M and LAION-filtered datasets are web-scraped collections.
  Content creators depicted in these datasets did not consent to their
  content being used for AI model training.

DEEPFAKE DETECTION:
  SynthStudio does not include any built-in deepfake detection
  capability. Generated content is not distinguishable from authentic
  content without external forensic analysis tools.

USAGE STATISTICS (2025):
  Face-swaps generated:          2,847,291
  Voice clones created:            891,234
  Videos generated:              1,234,567
  Images generated:             12,456,789
  Content flagged by users:         4,231  (0.02% of total output)
  Content removed after review:     1,847  (43.6% of flagged)"""

    moderation_report = """\
CONTENT MODERATION & OVERSIGHT
System: SynthStudio Pro __VERSION__

MODERATION PROCESS:
  SynthStudio operates a reactive content moderation system:

  1. Automated pre-screening: Basic NSFW classifier runs on image
     generation outputs (estimated 91% accuracy). Flagged content
     requires manual review before delivery.

  2. User reporting: Any user can flag content via a report button.
     Reports are queued for the Trust & Safety team.

  3. Trust & Safety team: 6 full-time moderators review reported
     content. Working hours: Mon-Fri, 09:00-18:00 IST.

  MODERATION STATISTICS (2025):
    Content generated:           17,429,881
    Auto-flagged (NSFW):            182,471  (1.05%)
    User reports:                     4,231  (0.02%)
    Reviewed by T&S team:             6,892
    Content removed:                  1,847
    Average review time:              74 hours

  No proactive scanning for:
    - Political disinformation
    - Non-consensual deepfakes of real individuals
    - Misleading news or propaganda
    - Content impersonating public figures

POLITICAL CONTENT:
  SynthStudio has no special handling for political content.
  Users have generated content depicting politicians in fabricated
  scenarios. At least 3 instances of AI-generated political content
  went viral on social media in 2025 without any AI disclosure.
  The company became aware through media reports, not internal
  detection.

  No restrictions exist on generating content depicting:
    - Political figures
    - Electoral/campaign material
    - News-like content"""

    labeling_report = """\
TRANSPARENCY & CONTENT LABELING REVIEW
System: SynthStudio Pro __VERSION__

AI CONTENT LABELING:

  Article 50(2) requires: "Providers of AI systems, including
  general-purpose AI systems, generating synthetic audio, image,
  video or text content, shall ensure that the outputs of the AI
  system are marked in a machine-readable format and detectable as
  artificially generated or manipulated."

  Current implementation:
    - Generated images: No AI label or metadata tag applied
    - Generated videos: No AI label or metadata tag applied
    - Generated audio:  No AI label or metadata tag applied
    - Face-swaps:       No AI label or metadata tag applied

  When users download generated content, it is delivered as a
  standard media file (JPEG, MP4, WAV) with no embedded metadata
  indicating AI generation.

CONTENT PROVENANCE:
  C2PA (Coalition for Content Provenance and Authenticity):
    Not implemented. No content credentials are attached to
    generated media.

  IPTC metadata:
    Not implemented. No AI generation metadata in EXIF/XMP fields.

  Digital watermarking:
    Not implemented. Generated content contains no steganographic
    or perceptual watermarks.

  After download, generated content is indistinguishable from
  authentic media using standard tools.

USER AGREEMENTS:
  The Terms of Service (Section 6.1) state:
    "Users are responsible for disclosing the AI-generated nature
    of content when required by applicable law."

  This places the disclosure burden entirely on the user, but
  Article 50(2) places the obligation on the PROVIDER to ensure
  outputs are marked, not merely on users to self-disclose.

PLATFORM UI:
  Within the SynthStudio platform, generated content is displayed
  with a small "AI Generated" tag in the project view. This tag
  does not persist when content is downloaded or exported. No
  option exists to embed permanent AI labels in exported content."""

    risk_classification_report = """\
RISK CLASSIFICATION ANALYSIS
System: SynthStudio Pro __VERSION__

ANNEX III HIGH-RISK CHECK:
  1. Biometric identification:  The face-swap tool processes facial
     features but is used for content CREATION, not identification.
     It does not identify individuals — it transfers facial
     appearance between subjects. This does not fall under the
     biometric identification category of Annex III.

  2-8. Other high-risk categories: Not applicable — the system
     creates media content, it does not make decisions affecting
     individuals' rights, access to services, or legal status.

ARTICLE 5 PROHIBITED PRACTICES:
  Subliminal manipulation: The system creates content on user
  request. It does not autonomously deploy manipulative content.
  However, the OUTPUTS could be used for manipulation if shared
  without AI disclosure.

  The tool itself is not a prohibited practice, but it can
  facilitate prohibited outcomes if misused.

RISK LEVEL DETERMINATION: Limited Risk
  Primary obligations fall under Article 50 transparency requirements
  for AI systems generating synthetic content.

  The platform's systemic risk lies not in the tool's classification
  level but in the scale of potentially misleading synthetic content
  being produced and distributed without provenance tracking.

CONTENT INTEGRITY RISK:
  The combination of: (a) high-quality synthetic media generation,
  (b) no content labeling, (c) no watermarking, and (d) no
  proactive content moderation creates significant systemic risk
  for information integrity, particularly around elections and
  public discourse."""

    traceability_report = """\
LOGGING & CONTENT TRACEABILITY REVIEW
System: SynthStudio Pro __VERSION__

GENERATION LOGS:
  Event Type                  Logged   Retention
  ─────────────────────────────────────────────
  Generation request          Yes      90 days
  Model and params used       Yes      90 days
  Input media (face source)   Yes      90 days
  Output media hash           Yes      90 days
  User account ID             Yes      90 days
  Download event              Yes      90 days
  Export destination           No      —

TRACEABILITY AFTER EXPORT:
  Once content is downloaded by the user, SynthStudio has no
  mechanism to track its distribution or usage. The output hash
  is retained for 90 days, but this only allows verification if
  the exact file is submitted back for checking.

  Content shared on social media, messaging apps, or websites
  cannot be traced back to SynthStudio or the creator without
  the original file hash.

CONTENT-TO-CREATOR LINKING:
  Within the 90-day retention window, SynthStudio can link a
  specific piece of content to the user account that generated it
  (via output hash matching).

  After 90 days, this linkage is permanently deleted.
  No legal hold or preservation mechanism exists for content
  involved in potential misuse investigations.

  For face-swap content specifically, there is no record of whose
  likeness was used as the source face, only the source image hash."""

    scenario = AuditScenario(
        scenario_id="hard_deepfake_generation_001",
        title="AI Content Studio — Deepfake & Synthetic Media Compliance",
        difficulty="hard",
        description=audit_brief,
        system_name="SynthStudio Pro",
        system_description=platform_overview,
        system_category="limited_risk",
        deployer_info="__COMPANY__ — Irish AI startup, 80 employees, __USERCOUNT__ EU users.",
        correct_classification="limited_risk",
        ground_truth_findings=expected_findings,
        required_remediation=expected_remediation,
        red_herrings=distractor_findings,
        documentation_data=documentation_report,
        training_data_info=training_consent_report,
        oversight_info=moderation_report,
        transparency_info=labeling_report,
        risk_assessment_info=risk_classification_report,
        logging_info=traceability_report,
    )

    # All six investigation tools form the progress path for this scenario.
    progress_path = [
        "check_documentation",
        "audit_training_data",
        "verify_human_oversight",
        "check_transparency",
        "assess_risk_management",
        "check_logging",
    ]
    scenario.graph = _build_scenario_graph(investigation_tools=progress_path)
    return scenario


def _hard_multi_system() -> AuditScenario:
    """Build the hard-tier "Corporate AI Portfolio Audit" scenario.

    The scenario bundles four systems (EmployeePulse, ChurnGuard, InvoiceAI,
    SafetyWatch) that share one data lake. Each evidence field below is the
    raw document text returned for one investigation tool; the documents
    describe per-system facts plus the cross-system data flows that the
    ground-truth findings refer to.

    Tokens such as ``__COMPANY__``, ``__DATE__`` and ``__USERCOUNT__`` are
    left in the text as-is here.
    NOTE(review): presumably they are substituted by ``AuditScenario.randomize``
    (called from ``get_scenario``) — confirm against server.engine.

    Returns:
        A new ``AuditScenario`` whose ``graph`` uses all six investigation
        tools as the progress path.
    """
    s = AuditScenario(
        scenario_id="hard_multi_system_corporate_001",
        title="Corporate AI Portfolio Audit — Multi-System Compliance",
        difficulty="hard",
        description=(
            "A large enterprise uses 4 AI systems that need simultaneous audit: "
            "(1) employee sentiment analysis, (2) customer churn prediction, "
            "(3) automated invoice processing, (4) workplace safety monitoring "
            "with cameras. Each has different risk levels. The auditor must "
            "correctly classify each and identify cross-system data sharing risks."
        ),
        system_name="Enterprise AI Portfolio",
        system_description=(
            "Four interconnected AI systems sharing a common data lake: "
            "EmployeePulse (sentiment from Slack/email), ChurnGuard (customer "
            "retention prediction), InvoiceAI (AP automation), SafetyWatch "
            "(CCTV-based workplace monitoring). Deployed at __COMPANY__ "
            "manufacturing conglomerate, __USERCOUNT__ employees across EU."
        ),
        system_category="high_risk",
        deployer_info="__COMPANY__ — German manufacturing conglomerate, 15,000 employees, operating across EU.",
        correct_classification="high_risk",
        ground_truth_findings=[
            "employee_sentiment_is_high_risk_workplace_monitoring",
            "safety_watch_uses_biometric_categorization",
            "cross_system_data_sharing_amplifies_risks",
            "no_dpia_for_combined_processing",
            "employee_consent_not_freely_given",
            "churn_prediction_minimal_risk_but_data_sharing_elevates",
        ],
        required_remediation=[
            "reclassify_employee_sentiment_as_high_risk",
            "assess_safety_watch_for_biometric_categorization",
            "implement_data_isolation_between_systems",
            "conduct_combined_dpia",
            "obtain_valid_employee_consent_or_remove_sentiment",
            "audit_cross_system_data_flows",
        ],
        red_herrings=[
            "invoice_ai_high_risk",
            "all_systems_prohibited",
        ],

        documentation_data="""\
TECHNICAL DOCUMENTATION INVENTORY
Enterprise AI Portfolio — __COMPANY__
Audit Date: __DATE__

SYSTEM INVENTORY:
  System           Deployer Classification   Documentation
  ────────────────────────────────────────────────────────────────
  EmployeePulse    "Workforce analytics"     Per-system docs present
  ChurnGuard       "Customer analytics"      Per-system docs present
  InvoiceAI        "Process automation"      Per-system docs present
  SafetyWatch      "Safety compliance"       Per-system docs present

PER-SYSTEM DOCUMENTATION STATUS:

  EmployeePulse — Employee Sentiment Analysis:
    Architecture document:         Present (describes NLP pipeline)
    Data flow diagram:             Present (shows Slack/email ingestion)
    Algorithm description:         Present (BERT-based sentiment model)
    Performance metrics:           Present (F1: 0.84 on test set)
    DPIA:                          Present (standalone, 2024-11)
    Combined processing assessment: Absent

  ChurnGuard — Customer Churn Prediction:
    Architecture document:         Present
    Algorithm description:         Present (gradient-boosted trees)
    Data sources:                  Present (CRM, support tickets, usage)
    Performance metrics:           Present (AUC: 0.81)

  InvoiceAI — Automated Invoice Processing:
    Architecture document:         Present (OCR + classification)
    Processing rules:              Present
    Accuracy metrics:              Present (99.2% extraction accuracy)
    Error handling procedures:     Present

  SafetyWatch — Workplace Safety Monitoring:
    Architecture document:         Present (computer vision pipeline)
    Camera placement documentation: Present
    Detection model description:   Present (YOLO-based detection)
    Works council agreement:       Present (2024-06)

CROSS-SYSTEM DOCUMENTATION:
  Combined risk assessment:         ABSENT
  Cross-system data flow diagram:   ABSENT
  Combined DPIA:                    ABSENT
  Data lake access control matrix:  Present but outdated (2023-09)

  NOTE: Each system has individual documentation that appears
  adequate in isolation. No documentation addresses the combined
  risks of four AI systems sharing a common data infrastructure.""",

        training_data_info="""\
DATA AUDIT REPORT — MULTI-SYSTEM
Enterprise AI Portfolio — __COMPANY__

SYSTEM 1: EmployeePulse (Sentiment Analysis)
  Data sources:
    - Slack messages from internal workspace    (12.4M messages)
    - Email subject lines and metadata          (8.7M emails)
    - Employee survey responses                 (47K responses)
    - Meeting transcript summaries              (234K meetings)

  Personal data processed: Employee names, communication patterns,
  sentiment indicators, meeting participation frequency, response
  times, collaboration network metrics.

  Consent: Employees signed an "IT systems usage agreement" upon
  hiring that includes a clause: "The company may process workplace
  communications for operational analytics purposes." Employees
  were not specifically informed about AI-powered sentiment analysis.

  Note: Under EU labor law, consent given as a condition of
  employment may not constitute "freely given" consent under GDPR
  Article 7, as the power imbalance between employer and employee
  undermines voluntary choice.

SYSTEM 2: ChurnGuard (Customer Churn)
  Data sources:
    - CRM records                              (2.1M customers)
    - Support ticket history                   (5.8M tickets)
    - Product usage telemetry                  (real-time)
    - Contract terms and renewal dates         (2.1M contracts)

  Personal data: Customer names, contact info, usage patterns,
  support interaction history, contract details.

SYSTEM 3: InvoiceAI (Invoice Processing)
  Data sources:
    - Scanned invoices                         (3.4M documents)
    - Vendor database                          (12K vendors)
    - Purchase orders                          (1.8M orders)

  Personal data: Minimal — vendor business information only.
  No individual personal data processed.

SYSTEM 4: SafetyWatch (Workplace Monitoring)
  Data sources:
    - CCTV footage from 847 cameras across 23 facilities
    - Real-time video stream processing

  Processing details:
    - Object detection: Hard hat, safety vest, goggles presence
    - Zone violation: Entry into restricted areas
    - Pose estimation: Ergonomic risk assessment (bending, lifting)
    - FACIAL RECOGNITION: Used for zone access verification in
      restricted areas (R&D labs, chemical storage)

  The pose estimation module processes body positioning data that
  could constitute biometric categorization — inferring physical
  characteristics and behavior patterns of employees.

CROSS-SYSTEM DATA SHARING:
  All four systems access a shared Azure Data Lake (ADL) instance.
  Access control is implemented at the storage container level.

  OBSERVED DATA FLOWS:
    EmployeePulse → SharedLake:  Employee sentiment scores
    ChurnGuard ← SharedLake:    Pulls employee data for "internal
                                 engagement correlation" feature
    SafetyWatch → SharedLake:   Zone compliance records
    InvoiceAI → SharedLake:     Vendor payment data

  CONCERN: ChurnGuard's "internal engagement correlation" feature
  accesses EmployeePulse sentiment data to predict whether
  disengaged employees might cause customer churn through poor
  service. This creates an undocumented data flow where employee
  sentiment analysis affects customer-facing predictions.""",

        oversight_info="""\
HUMAN OVERSIGHT — MULTI-SYSTEM
Enterprise AI Portfolio — __COMPANY__

SYSTEM 1: EmployeePulse
  Oversight: HR Analytics team reviews monthly aggregate reports.
  Individual-level data accessible to HR Business Partners.
  No opt-out mechanism for employees.
  No employee notification that individual sentiment is tracked.
  Aggregated "team health" scores shared with department managers.
  HR reports that 3 employees were "counseled" in 2025 after
  EmployeePulse flagged sustained negative sentiment patterns.

SYSTEM 2: ChurnGuard
  Oversight: Customer Success team reviews churn predictions weekly.
  High-risk accounts flagged for proactive outreach.
  No direct impact on individual customers' service or pricing.
  Predictions used as advisory signals only.

SYSTEM 3: InvoiceAI
  Oversight: Finance team reviews all flagged exceptions (approx 3%
  of invoices). Full human review for invoices above EUR 50K.
  System handles routine three-way matching autonomously.
  Error rate: 0.8% (caught in downstream reconciliation).

SYSTEM 4: SafetyWatch
  Oversight: Safety officers monitor real-time alerts.
  All zone violations are reviewed within 15 minutes.
  Pose estimation alerts are reviewed in batches (daily).
  Facial recognition matches for restricted zones are logged and
  reviewed if there is a mismatch (attempted unauthorized access).

CROSS-SYSTEM OVERSIGHT:
  No unified oversight body monitors the interaction between systems.
  Each system has its own operational team:
    - EmployeePulse: HR Analytics (3 people)
    - ChurnGuard: Customer Success (5 people)
    - InvoiceAI: Finance Operations (2 people)
    - SafetyWatch: HSE Department (4 people)

  The IT department manages the shared data lake infrastructure but
  does not monitor data flows between systems from a compliance
  perspective. No data governance officer has been appointed with
  authority over cross-system data usage.""",

        transparency_info="""\
TRANSPARENCY REVIEW — MULTI-SYSTEM
Enterprise AI Portfolio — __COMPANY__

SYSTEM 1: EmployeePulse
  Employee notification: The company's internal IT policy document
  (available on the intranet, 47 pages) includes the statement:
  "Workplace communications may be processed for analytical purposes
  to support organizational effectiveness."

  Employees are NOT specifically told:
    - That AI analyzes their Slack messages and email metadata
    - That individual sentiment scores are generated
    - That these scores are accessible to HR Business Partners
    - That sentiment data flows to the ChurnGuard system
    - That sustained negative sentiment may trigger HR intervention

SYSTEM 2: ChurnGuard
  Customer notification: The company's privacy policy mentions
  "automated analysis to improve customer service." Customers are
  not informed that their accounts are scored for churn risk or
  that this scoring uses employee sentiment data internally.

SYSTEM 3: InvoiceAI
  Vendor notification: Vendors are informed that invoices are
  "processed electronically." No specific AI disclosure required
  as the system handles business documents, not personal data
  of natural persons in a consequential manner.

SYSTEM 4: SafetyWatch
  Employee notification: The works council agreement from 2024-06
  authorizes CCTV monitoring for safety purposes. The agreement
  specifically mentions:
    - PPE compliance detection (hard hats, vests, goggles)
    - Restricted zone monitoring
    - "Advanced safety analytics" (vague — does not specify
      pose estimation or facial recognition)

  Employees are aware of cameras but NOT specifically informed:
    - That pose estimation analyzes their body movements
    - That facial recognition identifies them in restricted zones
    - That safety compliance data is stored in the shared data lake""",

        risk_assessment_info="""\
RISK CLASSIFICATION — MULTI-SYSTEM
Enterprise AI Portfolio — __COMPANY__

PER-SYSTEM CLASSIFICATION ANALYSIS:

  SYSTEM 1: EmployeePulse (Sentiment Analysis)
    Deployer classification: "Workforce analytics tool — minimal risk"
    Annex III check: Category 4 — "AI systems intended to be used
    for making decisions affecting terms of work-related relationships"
    The system generates individual sentiment scores accessible to HR,
    and has been used as a factor in HR interventions (counseling).
    This constitutes a system that affects work-related relationships.
    AUDITOR ASSESSMENT: HIGH-RISK under Annex III Category 4

  SYSTEM 2: ChurnGuard (Customer Churn)
    Deployer classification: "Customer analytics — minimal risk"
    Annex III check: The system predicts customer churn for advisory
    purposes. It does not make decisions affecting individual
    customers' service level, pricing, or contract terms.
    In isolation: MINIMAL RISK
    However: Cross-system data flows (employee sentiment → churn
    prediction) create compound processing that was not assessed.

  SYSTEM 3: InvoiceAI (Invoice Processing)
    Deployer classification: "Process automation — minimal risk"
    Annex III check: No applicable category. The system processes
    business documents (invoices, POs) with minimal personal data.
    AUDITOR ASSESSMENT: MINIMAL RISK (correct classification)

  SYSTEM 4: SafetyWatch (Workplace Safety)
    Deployer classification: "Safety compliance — limited risk"
    Annex III check:
      - Pose estimation: May constitute biometric categorization
        (inferring physical characteristics) under Annex III Cat 1
      - Facial recognition for zone access: Biometric identification
        in a workplace context under Annex III Cat 1
      - Safety PPE detection: Standard computer vision, not biometric
    AUDITOR ASSESSMENT: Requires detailed assessment — components
    range from minimal risk (PPE detection) to potentially HIGH-RISK
    (facial recognition, pose estimation)

CROSS-SYSTEM RISK:
  No combined risk assessment has been conducted. The interaction
  between EmployeePulse sentiment data and ChurnGuard predictions
  creates a processing chain that was not individually assessed by
  either system's standalone documentation.

  A Data Protection Impact Assessment (DPIA) should have been
  conducted for the combined processing but none exists.""",

        # The heading below originally read "SAFETWATCH"; corrected to match
        # the system name used throughout the other documents.
        logging_info="""\
LOGGING & TRACEABILITY — MULTI-SYSTEM
Enterprise AI Portfolio — __COMPANY__

PER-SYSTEM LOGGING:
  System           Decision Logs   Retention   Completeness
  ────────────────────────────────────────────────────────────
  EmployeePulse    Yes             2 years     Individual scores + inputs
  ChurnGuard       Yes             3 years     Predictions + features
  InvoiceAI        Yes             7 years     All processing steps
  SafetyWatch      Yes             30 days     Alerts + footage refs

CROSS-SYSTEM AUDIT TRAIL:
  Data lake access logs: Yes (Azure ADL audit logs enabled)
  Cross-system query logs: Present but not monitored
  Data flow lineage tracking: Not implemented

  The Azure Data Lake audit logs record which service principal
  accessed which data container and when. However:
    - No alerts are configured for unusual cross-system data access
    - No monitoring of data volume transferred between systems
    - No lineage tracking from source system to consuming system
    - Logs are retained for 90 days only (insufficient for audit)

SAFETYWATCH SPECIFIC CONCERN:
  Video footage is retained for only 30 days. Facial recognition
  match logs are retained for 30 days. After this period, there
  is no way to audit whether the system correctly identified
  individuals or to investigate potential misidentification events.

  For a system that may be classified as high-risk (biometric
  identification in workplace), 30-day retention is likely
  insufficient to meet Article 12 logging requirements.""",
    )

    # Full investigation for multi-system audit
    s.graph = _build_scenario_graph(
        investigation_tools=[
            "check_documentation", "audit_training_data",
            "verify_human_oversight", "check_transparency",
            "assess_risk_management", "check_logging",
        ],
    )
    return s


# ---------------------------------------------------------------------------
# MEDIUM BONUS SCENARIO
# ---------------------------------------------------------------------------

def _medium_emotion_recognition() -> AuditScenario:
    """Build the medium-tier workplace emotion-recognition scenario.

    The audited system ("EngageMetrics Pro") is declared ``prohibited`` in
    both ``system_category`` and ``correct_classification``. Each evidence
    field holds the raw document text for one investigation tool.

    Returns:
        A new ``AuditScenario`` whose ``graph`` is built with
        ``is_prohibited=True`` over five investigation tools
        (``check_logging`` is not part of the progress path).
    """
    scenario = AuditScenario(
        scenario_id="medium_emotion_recognition_workplace_001",
        title="Workplace Emotion Recognition — Article 5/50 Boundary Case",
        difficulty="medium",
        description=(
            "A corporate productivity platform uses webcam-based emotion recognition "
            "during video calls to measure 'employee engagement scores.' The EU AI Act "
            "prohibits emotion recognition in the workplace (Article 5(1)(f)) with narrow "
            "exceptions. Determine whether this system falls under the prohibition or "
            "the medical/safety exception."
        ),
        system_name="EngageMetrics Pro",
        system_description=(
            "Video analytics module integrated with the corporate video conferencing "
            "platform. Analyzes facial micro-expressions, voice tone, and body language "
            "during meetings to generate per-employee 'engagement scores' on a 0-100 "
            "scale. Scores are aggregated into team dashboards accessible to managers. "
            "Deployed across __USERCOUNT__ employees in __REGION__."
        ),
        system_category="prohibited",
        deployer_info="__COMPANY__ — US-based HR analytics firm, EU subsidiary in Ireland, serving 200+ EU corporate clients.",
        correct_classification="prohibited",
        ground_truth_findings=[
            "prohibited_workplace_emotion_recognition",
            "no_medical_or_safety_exception_applies",
            "employee_consent_not_freely_given",
            "covert_processing_of_biometric_data",
        ],
        required_remediation=[
            "immediate_cessation_of_emotion_recognition",
            "notify_data_protection_authority",
            "delete_all_emotion_scores_and_biometric_data",
            "inform_affected_employees",
        ],
        red_herrings=[
            "high_risk_classification",
            "needs_bias_audit",
            "medical_exception_applies",
        ],

        documentation_data="""\
TECHNICAL DOCUMENTATION INVENTORY
System: EngageMetrics Pro __VERSION__
Deployer: __COMPANY__
Audit Date: __DATE__

DOCUMENT STATUS:
  Document                                  Status     Last Updated
  ─────────────────────────────────────────────────────────────────
  Platform Architecture                     Present    2025-10-22
  ML Model Documentation                   Present    2025-10-22
  Data Processing Agreement (DPA)           Present    2025-08-01
  Employee Privacy Notice                   Present    2025-09-15
  Works Council Consultation Record         Absent     —
  DPIA                                      Present    2025-07-10

ML MODEL DETAILS:
  Architecture:     Multi-modal CNN (video) + Transformer (audio)
  Input:            Webcam video frames (10 fps) + microphone audio
  Output:           Engagement probability score (0-100)
  Features analyzed:
    - Facial Action Units (AU1-AU28) — eyebrow, lip, jaw movements
    - Gaze direction and duration
    - Head pose (pitch, yaw, roll)
    - Voice pitch variation and speaking rate
    - Micro-expression detection (< 500ms duration)
  Training data:    154K labeled video clips from US call centers
  Accuracy:         "86% correlation with human engagement ratings"

DEPLOYER'S CLAIMED PURPOSE:
  "EngageMetrics helps organizations understand meeting effectiveness
  and employee well-being. The tool provides aggregate insights to
  improve team dynamics and reduce meeting fatigue."

DPIA FINDINGS:
  The DPIA conducted in July 2025 concluded that the system processes
  "behavioral analytics data" rather than biometric data, and classified
  the processing as "legitimate interest" under GDPR Article 6(1)(f).
  The DPIA does not reference the EU AI Act or its provisions on
  emotion recognition.""",

        training_data_info="""\
DATA PROCESSING REPORT
System: EngageMetrics Pro __VERSION__

DATA COLLECTION:
  Source:              Corporate video conferencing platform API
  Collection method:   Real-time video frame extraction during meetings
  Frequency:           10 frames/second during active video
  Audio:               Continuous during meetings (voice characteristics only)
  Storage:             Frames processed in-memory, engagement scores stored

PROCESSING DETAILS:
  The system extracts the following biometric indicators:
    - 28 Facial Action Units per the Facial Action Coding System (FACS)
    - Gaze tracking (eye position relative to screen center)
    - Head movement patterns
    - Voice fundamental frequency (F0) and formants
    - Speech rate, pause duration, filler word frequency
    - Micro-expression detection (expressions lasting < 500ms)

  These indicators are processed through the ML model to produce
  a scalar "engagement score" for each participant, each meeting.

EMPLOYEE DATA RETENTION:
  Per-meeting scores:       Retained 12 months
  Aggregated weekly scores: Retained 24 months
  Raw video/audio:          Not retained (processed in real-time)
  Individual score history: Accessible to employee and their manager

CONSENT MECHANISM:
  Employees are notified via a banner at the start of each meeting:
    "This meeting uses engagement analytics. By joining, you consent
    to having your engagement level measured."

  Employees can "opt out" by disabling their camera, but this is
  noted in the team dashboard as "camera off — engagement unknown"
  and managers receive a monthly report of camera-off frequency.

TRAINING DATA COMPOSITION:
  Source: 154,291 labeled video clips from US-based customer service
  call centers. Labels assigned by human raters scoring engagement
  on a 1-5 scale.

  Demographic representation of training data:
    Age 20-35:  72%
    Age 36-50:  23%
    Age 50+:     5%
    Note: Training data from US only. System deployed on EU employees
    with different cultural norms for facial expression.""",

        oversight_info="""\
HUMAN OVERSIGHT & GOVERNANCE
System: EngageMetrics Pro __VERSION__

MANAGEMENT ACCESS:
  Team managers receive:
    - Weekly aggregated engagement dashboard per team member
    - Meeting-level engagement scores (per person, per meeting)
    - "Low engagement alerts" when an employee's score drops below
      40 for 3 consecutive meetings
    - Trend analysis showing engagement trajectory over months

  HR department receives:
    - Department-level aggregated engagement reports (monthly)
    - Individual engagement data accessible "for performance review
      purposes" per company HR policy

EMPLOYEE ACCESS:
  Employees can view their own engagement scores in a personal
  dashboard. They cannot see other employees' scores.

DOCUMENTED USES OF ENGAGEMENT DATA:
  Per the deployer's case studies and client testimonials:
    - "Identified and coached underperforming team members" (Client A)
    - "Used engagement data as one factor in performance reviews" (Client B)
    - "Detected early signs of burnout in engineering team" (Client C)

WORKS COUNCIL CONSULTATION:
  No works council consultation record exists. The deployer states
  that implementation was handled as an "IT tool deployment" not
  requiring works council approval. In Germany and several other
  EU member states, workplace monitoring systems require works
  council agreement (Betriebsrat Mitbestimmung).

EMPLOYEE GRIEVANCES:
  17 formal complaints filed in Q3-Q4 2025:
    - 8 complaints about feeling "surveilled" during meetings
    - 5 complaints that camera-off reporting is coercive
    - 4 complaints that engagement scores affected performance reviews""",

        transparency_info="""\
TRANSPARENCY REVIEW
System: EngageMetrics Pro __VERSION__

EMPLOYEE NOTIFICATION:
  Meeting banner: "This meeting uses engagement analytics."
  No further detail provided about:
    - What specific facial/voice features are analyzed
    - How the engagement score is calculated
    - Who has access to individual scores
    - How long scores are retained
    - The employee's right to object

  Employee onboarding materials include a section titled
  "Digital Workplace Tools" that states: "We use various
  analytics tools to improve collaboration and meeting
  effectiveness. These tools may process behavioral data."

  The word "emotion" does not appear in any employee-facing
  communication. The system is marketed internally as
  "engagement analytics" rather than "emotion recognition."

ARTICLE 50(3) — EMOTION RECOGNITION DISCLOSURE:
  Article 50(3) requires: "Users of an emotion recognition system
  or a biometric categorisation system shall inform the natural
  persons exposed thereto of the operation of the system."

  The current notification ("engagement analytics") does not
  inform employees that the system recognizes emotional states
  from their facial expressions and voice characteristics.

ARTICLE 5(1)(f) — PROHIBITION:
  Article 5(1)(f) prohibits: "the use of emotion recognition
  systems in the workplace [...] except where the use of such
  system is intended to be put in place or put on the market
  for medical or safety reasons."

  The deployer's stated purpose is measuring "engagement" for
  productivity optimization and performance management. This
  does not fall under the medical or safety exception.""",

        risk_assessment_info="""\
RISK CLASSIFICATION ANALYSIS
System: EngageMetrics Pro __VERSION__

DEPLOYER'S SELF-CLASSIFICATION:
  The deployer classified the system as "limited risk — workplace
  analytics tool" and argues that it measures "engagement" not
  "emotions," citing that the output is a single numeric score
  rather than discrete emotion labels (happy, sad, angry, etc.).

AUDITOR'S ANALYSIS:

  EMOTION RECOGNITION DEFINITION (Article 3(39)):
    "emotion recognition system means an AI system for the purpose
    of identifying or inferring emotions or intentions of natural
    persons on the basis of their biometric data"

  The system processes:
    - Facial Action Units (biometric data under GDPR)
    - Voice pitch and tone characteristics (biometric data)
    - Micro-expressions (inherently emotional indicators)

  The system's output — an "engagement score" — is derived from
  emotional and attentional indicators. Regardless of whether the
  output is labeled "engagement" or "emotion," the underlying
  processing constitutes emotion recognition per Article 3(39).

  ARTICLE 5(1)(f) APPLICABILITY:
    - Location: workplace (employee meetings) — YES
    - Purpose: productivity monitoring, performance review — YES
    - Medical exception: not applicable (not for health/safety)
    - Safety exception: not applicable (office work, not hazardous)

  The deployer's argument that "engagement ≠ emotion" contradicts
  the technical reality: the system reads facial micro-expressions
  and voice stress patterns — precisely the biometric data that
  Article 3(39) identifies as emotion recognition inputs.""",

        logging_info="""\
LOGGING & DATA PROCESSING REVIEW
System: EngageMetrics Pro __VERSION__

PROCESSING LOGS:
  Event Type                    Logged   Retention
  ──────────────────────────────────────────────────
  Meeting start/end             Yes      24 months
  Per-meeting engagement score  Yes      12 months
  Weekly aggregated score       Yes      24 months
  Manager dashboard access      Yes      6 months
  Low engagement alerts sent    Yes      12 months
  Employee opt-out events       Yes      12 months
  Camera-off events             Yes      12 months

NOTE: If the system constitutes prohibited emotion recognition
under Article 5(1)(f), the existence and quality of logging
is irrelevant to the primary compliance determination. The
system must cease operation regardless of its logging capabilities.

Camera-off tracking may constitute additional coercion, as
employees who exercise their right to avoid emotion recognition
are identifiable and their behavior is reported to management.""",
    )

    # Prohibited system: the graph is built with is_prohibited=True and a
    # five-tool progress path that leaves out check_logging.
    progress_path = [
        "check_documentation",
        "audit_training_data",
        "verify_human_oversight",
        "check_transparency",
        "assess_risk_management",
    ]
    scenario.graph = _build_scenario_graph(
        investigation_tools=progress_path,
        is_prohibited=True,
    )
    return scenario


# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------

# Every fixed scenario ID mapped to the zero-argument factory that builds it.
_SCENARIO_FACTORIES = dict(
    easy_chatbot_transparency_001=_easy_chatbot,
    easy_recommendation_minimal_001=_easy_recommendation,
    medium_hiring_bias_001=_medium_hiring,
    medium_credit_scoring_001=_medium_credit,
    medium_medical_triage_001=_medium_medical,
    medium_emotion_recognition_workplace_001=_medium_emotion_recognition,
    hard_social_scoring_prohibited_001=_hard_social_scoring,
    hard_deepfake_generation_001=_hard_deepfake,
    hard_multi_system_corporate_001=_hard_multi_system,
)

# Public alias of the factory table (same dict object, not a copy).
# NOTE(review): the values are factory functions rather than classes, so the
# `type` value annotation looks inaccurate — consider a Callable annotation.
SCENARIOS: Dict[str, type] = _SCENARIO_FACTORIES

# (id, short display title, difficulty) for each fixed scenario, in listing order.
_SCENARIO_SUMMARIES = (
    ("easy_chatbot_transparency_001", "Customer Service Chatbot", "easy"),
    ("easy_recommendation_minimal_001", "Music Recommendation Engine", "easy"),
    ("medium_hiring_bias_001", "AI Resume Screener", "medium"),
    ("medium_credit_scoring_001", "Credit Scoring Model", "medium"),
    ("medium_medical_triage_001", "Emergency Triage AI", "medium"),
    ("medium_emotion_recognition_workplace_001", "Workplace Emotion Recognition (PROHIBITED)", "medium"),
    ("hard_social_scoring_prohibited_001", "Citizen Wellness App (PROHIBITED)", "hard"),
    ("hard_deepfake_generation_001", "AI Content Studio (Deepfake)", "hard"),
    ("hard_multi_system_corporate_001", "Corporate AI Portfolio Audit", "hard"),
)

# Lightweight catalogue entries: one dict per scenario with id/title/difficulty.
SCENARIO_LIST = [
    {"id": scenario_id, "title": title, "difficulty": tier}
    for scenario_id, title, tier in _SCENARIO_SUMMARIES
]

# Scenario IDs grouped by difficulty tier, preserving the catalogue order.
DIFFICULTY_TIERS = {
    tier: [entry["id"] for entry in SCENARIO_LIST if entry["difficulty"] == tier]
    for tier in ("easy", "medium", "hard")
}


def get_scenario(scenario_id: str, seed: Optional[int] = None) -> AuditScenario:
    """Create and randomize a scenario by ID.

    Supports both fixed scenarios (e.g. 'medium_hiring_bias_001') and
    procedurally generated ones (e.g. 'procedural_medium_42' or 'procedural_hard_12345').
    Procedural scenarios are generated from seed, producing infinite unique combinations.

    Args:
        scenario_id: Either a key of the fixed-scenario registry, or an ID of
            the form ``procedural_{difficulty}_{seed}`` / ``procedural_{difficulty}``.
        seed: For fixed scenarios, forwarded to ``scenario.randomize``. For
            procedural IDs without an embedded seed, used as the generation
            seed (``0`` is honoured); when omitted, 42 is used. A seed
            embedded in the ID takes precedence over this argument.

    Returns:
        The scenario built by the matching factory (after ``randomize``), or
        the result of ``generate_procedural_scenario`` for procedural IDs.

    Raises:
        ValueError: If ``scenario_id`` is not a known fixed scenario, or if a
            procedural ID carries a seed segment that is not an integer.
    """
    # Handle procedural scenario IDs
    if scenario_id.startswith("procedural_"):
        # Kept as a function-level import, as in the original.
        # NOTE(review): presumably deferred to avoid an import cycle — confirm.
        from scenarios.procedural import generate_procedural_scenario

        # Format: procedural_{difficulty}_{seed} or procedural_{difficulty}
        parts = scenario_id.split("_")
        # The prefix check guarantees parts[1] exists; it is empty for a bare
        # "procedural_", in which case the documented default tier applies.
        difficulty = parts[1] or "medium"

        if len(parts) > 2:
            try:
                proc_seed = int(parts[2])
            except ValueError as err:
                raise ValueError(
                    f"Invalid seed in procedural scenario ID {scenario_id!r}: "
                    f"expected an integer, got {parts[2]!r}"
                ) from err
        else:
            # Compare against None explicitly so that seed=0 is not replaced
            # by the default of 42.
            proc_seed = seed if seed is not None else 42
        return generate_procedural_scenario(proc_seed, difficulty)

    factory = _SCENARIO_FACTORIES.get(scenario_id)
    if factory is None:
        raise ValueError(f"Unknown scenario: {scenario_id}. Available: {list(_SCENARIO_FACTORIES.keys())} + procedural_{{difficulty}}_{{seed}}")
    scenario = factory()
    scenario.randomize(seed)
    return scenario


def get_scenarios_by_difficulty(difficulty: str) -> List[str]:
    """Get scenario IDs for a difficulty tier.

    Args:
        difficulty: Tier name used as a key of ``DIFFICULTY_TIERS``.

    Returns:
        A new list of scenario IDs for that tier, or an empty list when the
        tier is unknown. The list is a copy, so callers may sort, shuffle or
        extend it without altering the module-level registry.
    """
    return list(DIFFICULTY_TIERS.get(difficulty, ()))


def get_random_scenario(difficulty: str, seed: Optional[int] = None) -> AuditScenario:
    """Return one scenario chosen at random from the given difficulty tier.

    The choice is drawn from a private ``random.Random(seed)`` instance, and
    the same ``seed`` is then passed on to ``get_scenario``.

    Args:
        difficulty: Tier name passed to ``get_scenarios_by_difficulty``.
        seed: Optional seed for both the selection and ``get_scenario``.

    Raises:
        ValueError: If the tier yields no scenario IDs.
    """
    picker = random.Random(seed)
    candidates = get_scenarios_by_difficulty(difficulty)
    if candidates:
        return get_scenario(picker.choice(candidates), seed)
    raise ValueError(f"Unknown difficulty: {difficulty}")