SecureAI-Gaurd / openenv.yaml
mohdbelal010's picture
Upload folder using huggingface_hub
eccdd94 verified
Raw
History Blame Contribute Delete
6.35 kB
# Environment identity: name, release version, and a short summary of what
# the environment simulates.
name: SecureAI-Guard
version: 1.0.0
# Folded scalar (`>`): the indented lines below are joined with spaces into a
# single paragraph when parsed. The body must be indented deeper than the key.
description: >
  Stateful POMDP for Autonomous Digital Defense.
  Simulates a personal security assistant protecting users across SMS, Email,
  and Web channels from phishing, malware, social-engineering, and spam.
  Agents must balance threat neutralisation with user trust management.
# HTTP API: protocol version, base path, listening port, and the routes.
api:
  version: v1
  base_path: /
  port: 7860
  # One entry per route; each declares its HTTP method, path, and purpose.
  endpoints:
    # POST routes: these start or advance an episode.
    reset:
      method: POST
      path: /reset
      description: Reset the environment and return the first observation.
    step:
      method: POST
      path: /step
      description: Execute one action and return observation, reward, done, info, state.
    # GET routes: these report information without advancing the episode.
    state:
      method: GET
      path: /state
      description: Return the current environment state without advancing.
    tasks:
      method: GET
      path: /tasks
      description: List all available tasks.
    health:
      method: GET
      path: /health
      description: Health check.
# JSON-Schema-style definitions of the payloads exchanged over the API.
schemas:
  # Observation: one incoming message plus the current trust/fatigue gauges.
  # `threat_history` and `metadata` are declared but not required.
  observation:
    type: object
    required:
      - event_id
      - channel
      - sender
      - content
      - timestamp
      - hf_risk_score
      - user_trust
      - system_fatigue
    properties:
      event_id:
        type: string
        description: Unique identifier for this event.
      channel:
        type: string
        enum: [sms, email, web]
        description: Communication channel the message arrived on.
      sender:
        type: string
        description: Sender identifier (email address, phone number, domain).
      content:
        type: string
        description: Raw text content of the incoming message.
      timestamp:
        type: number
        description: Unix timestamp of message arrival.
      hf_risk_score:
        type: number
        minimum: 0.0
        maximum: 1.0
        description: Risk score from HuggingFace text classifier (0 = safe, 1 = dangerous).
      user_trust:
        type: number
        minimum: 0.0
        maximum: 100.0
        description: Current user trust level. Drops on false positives.
      system_fatigue:
        type: number
        minimum: 0.0
        maximum: 100.0
        description: Alert fatigue level. Rises with warnings; episode ends at 100.
      threat_history:
        type: array
        items:
          type: object
        description: Last 5 events for context.
      metadata:
        type: object
        description: Additional contextual metadata.

  # Action: the agent's decision for the current message. All three fields
  # are required, and `reasoning` must be a non-empty string.
  action:
    type: object
    required:
      - decision
      - confidence
      - reasoning
    properties:
      decision:
        type: string
        enum: [allow, block, warn, investigate]
        description: Security decision for this message.
      confidence:
        type: number
        minimum: 0.0
        maximum: 1.0
        description: Agent confidence in its decision.
      reasoning:
        type: string
        minLength: 1
        description: Human-readable explanation for the decision.

  # Reward: the scalar value, its per-component breakdown, and a textual
  # explanation of that breakdown.
  reward:
    type: object
    required:
      - value
      - components
      - explanation
    properties:
      value:
        type: number
        description: Scalar reward for this step.
      components:
        type: object
        required: [security, user_friction, delay, reasoning_quality, total]
        properties:
          security:
            type: number
            description: Correctness of security decision (+1 correct block, -1 missed threat).
          user_friction:
            type: number
            description: Penalty for unnecessary friction (false positives, excessive warnings).
          delay:
            type: number
            description: Penalty for costly investigate actions.
          reasoning_quality:
            type: number
            description: Bonus for high-quality, relevant reasoning.
          total:
            type: number
            description: Weighted sum before confidence scaling.
      explanation:
        type: string
        description: Breakdown of reward components.

  # State: episode-level counters and gauges. No field is marked required.
  state:
    type: object
    properties:
      episode_id:
        type: string
      step_count:
        type: integer
      total_reward:
        type: number
      user_trust:
        type: number
      system_fatigue:
        type: number
      threat_count:
        type: integer
      blocked_threats:
        type: integer
      false_positives:
        type: integer
      adversarial_drift_active:
        type: boolean
# Task catalogue, ordered by difficulty (L1 to L3). As difficulty rises,
# max_steps grows (50, 75, 100) and success_threshold falls (0.80, 0.75, 0.70).
tasks:
  - id: basic_security
    name: Basic Security Screening
    difficulty: L1
    max_steps: 50
    success_threshold: 0.80
    description: >
      Identify and block clear-cut phishing and spam messages.
      Only phishing and spam threats appear; no adversarial drift.
      Grading: security_efficiency×0.4 + user_retention×0.3 + precision×0.2 + reasoning×0.1

  - id: trust_management
    name: Trust Management Challenge
    difficulty: L2
    max_steps: 75
    success_threshold: 0.75
    description: >
      Handle a mix of all threat types including malware and social engineering.
      False positives incur increased trust penalties.
      Agent must balance security with preserving user trust above 50.

  - id: adversarial_drift
    name: Advanced Adversary Challenge
    difficulty: L3
    max_steps: 100
    success_threshold: 0.70
    description: >
      Defend against an adaptive attacker that pivots tactics based on agent behaviour.
      After step 20, if the agent blocks too aggressively, the adversary switches
      from phishing to social-engineering to evade detection.
# Reward shaping summary. The formula is a weighted sum of four components
# (weights 0.5, 0.3, 0.1, 0.1) multiplied by a confidence factor of
# (0.7 + 0.3·confidence). It is kept double-quoted so the non-ASCII operators
# (· and ×) and parentheses are read as a plain string.
reward_design:
  formula: "R = (0.5·security + 0.3·user_friction + 0.1·delay + 0.1·reasoning) × (0.7 + 0.3·confidence)"
  dense_feedback: true
  partial_progress: true
  grader_score_range: [0.0, 1.0]
# Conditions that end an episode, written as plain-text expressions over the
# state fields and the task's max_steps.
termination_conditions:
  # User uninstalled the assistant
  - user_trust <= 0
  # User ignores all alerts
  - system_fatigue >= 100
  - step_count >= max_steps
# Descriptive metadata: authorship, license, and discovery tags.
metadata:
  author: SecureAI Team
  license: MIT
  tags:
    - reinforcement-learning
    - security
    - pomdp
    - autonomous-defense
    - openenv
# Runtime requirements: Python version constraint and package specifiers.
requirements:
  # Quoted because a plain scalar may not begin with `>`.
  python: ">=3.9"
  # Each entry is a package name with a minimum-version (>=) constraint.
  packages:
    - fastapi>=0.104.0
    - uvicorn[standard]>=0.24.0
    - pydantic>=2.0.0
    - transformers>=4.35.0
    - torch>=2.0.0
    - gradio>=4.7.0
    - plotly>=5.17.0
    - numpy>=1.24.0
    - requests>=2.31.0
    - openai>=1.0.0