Gridmind / env /rewards.go
adityss's picture
feat: implement multi-component dense reward function and environmental logic for GridMind-RL
b81683f
// Package env implements the multi-component dense reward function for GridMind-RL.
package env
import "math"
// ComputeRewardInput bundles all inputs needed to compute the reward for one step.
// It is passed by value to ComputeReward, which reads the fields below to fill
// in a RewardComponents value.
type ComputeRewardInput struct {
	// B is the building state. ComputeReward dereferences it unconditionally
	// (indoor/setpoint temperature, jobs, storage level, carbon intensity),
	// so it must be non-nil.
	B *BuildingState
	// Act is the action for this step; ComputeReward reads Act.HVACPowerLevel
	// for the stability term.
	Act ActionModel
	StepCost float64 // $ cost incurred this step
	EnergyKWh float64 // kWh consumed this step
	TMin float64 // lower temperature bound (°C)
	TMax float64 // upper temperature bound (°C)
	StepCarbon float64 // gCO2 emitted this step (not read by ComputeReward in this file)
	BatchMissed int // number of batch jobs that missed deadline this step
	GridStress float64 // 0.0–1.0 grid stress signal
	ShedFraction float64 // clamped load shed fraction
	TaskID int // 1, 2, 3, or 4; the instruction reward is only computed when this is 4
	PrevHVACLevel float64 // previous step's HVAC power level (for stability)
	ChargeRate float64 // current thermal charge rate
	PrevChargeRate float64 // previous step's thermal charge rate
	StorageDelta float64 // change in storage level (+ = charging) (not read by ComputeReward in this file)
	PriceCurve []float64 // full episode price curve for arbitrage calc; indexed by CurrentStep
	CurrentStep int // current step index into PriceCurve
	InstructionCard *InstructionCard // non-nil for Task 4 episodes
	ActiveFaults []FaultEvent // currently active fault events for Track 3
}
// ComputeReward returns a dense RewardComponents struct from the current step inputs.
//
// Seven core components (cost, temperature, grid response, deadlines, storage
// efficiency, stability, carbon) are computed on every step. Two more are
// conditional: InstructionReward is evaluated only when TaskID is 4 and an
// InstructionCard is present, and FaultMitigation only when at least one fault
// is active; otherwise they stay at zero. Total is the unweighted sum of all
// nine. Task-specific weighting is handled by the grading system (tasks.go),
// not here.
//
// inp.B must be non-nil: it is dereferenced unconditionally. A CurrentStep
// outside PriceCurve (negative or past the end) simply skips the arbitrage
// part of the efficiency bonus instead of indexing out of range.
func ComputeReward(inp ComputeRewardInput) RewardComponents {
	rc := RewardComponents{}

	// ── 1. Cost Savings ─────────────────────────────────────────────────────
	// Positive baseline minus relative cost: smart agents save money.
	typicalCost := 4.0
	rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0

	// ── 2. Temperature Constraint ────────────────────────────────────────────
	// Gaussian bonus for being near setpoint; penalty outside comfort bounds.
	temp := inp.B.IndoorTemperature
	rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)

	// ── 3. Grid Stress Response ──────────────────────────────────────────────
	// Rewards proactive grid awareness and demand-response compliance.
	rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)

	// ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
	// Penalise missed batch jobs, reward jobs that have not missed a deadline.
	if inp.BatchMissed > 0 {
		rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
	}
	// A job counts as on track when it has not missed its deadline, whether
	// it is still pending or already completed.
	onTrackJobs := 0
	for _, job := range inp.B.Jobs {
		if !job.MissedDeadline {
			onTrackJobs++
		}
	}
	// The on-track bonus is only granted on steps where nothing was missed.
	if onTrackJobs > 0 && inp.BatchMissed == 0 {
		rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
	}

	// ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
	// Rewards smart storage use: arbitrage + maintaining useful storage levels.
	// Both bounds are checked so that a negative step index cannot panic.
	if inp.CurrentStep >= 0 && inp.CurrentStep < len(inp.PriceCurve) {
		rc.EfficiencyBonus = computeArbitrageBonus(
			inp.ChargeRate,
			inp.PriceCurve[inp.CurrentStep],
			inp.PriceCurve,
			inp.CurrentStep,
		)
	}
	// Baseline: reward maintaining a balanced storage level (not empty, not
	// always full). Levels between the two bands get no adjustment.
	storageLevel := inp.B.ThermalStorageLevel
	if storageLevel > 0.2 && storageLevel < 0.85 {
		rc.EfficiencyBonus += 0.15 // good operating range
	} else if storageLevel <= 0.05 || storageLevel >= 0.98 {
		rc.EfficiencyBonus -= 0.1 // extremes are wasteful
	}

	// ── 6. Stability Reward/Penalty ──────────────────────────────────────────
	// Smooth operation earns a bonus; rapid oscillation earns a penalty.
	hvacDelta := math.Abs(inp.Act.HVACPowerLevel - inp.PrevHVACLevel)
	chargeDelta := math.Abs(inp.ChargeRate - inp.PrevChargeRate)
	oscillation := hvacDelta*0.5 + chargeDelta*0.3
	if oscillation > 0.3 {
		rc.StabilityPenalty = -(oscillation - 0.3) * 0.8
	} else {
		// Positive reward for smooth, stable control.
		rc.StabilityPenalty = (0.3 - oscillation) * 0.4
	}

	// ── 7. Carbon Reward ─────────────────────────────────────────────────────
	// Rewards low-carbon operation based on grid carbon intensity.
	carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
	// Baseline bonus, reduced by carbon-heavy consumption.
	rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
	// Extra bonus for operating during genuinely clean grid periods.
	if carbonNorm < 0.3 {
		rc.CarbonReward += 0.15
	}

	// ── 8. Instruction-Following Reward (Task 4 only) ─────────────────────────
	if inp.TaskID == 4 && inp.InstructionCard != nil {
		rc.InstructionReward = computeInstructionReward(inp.InstructionCard, inp.B, inp.ShedFraction, inp.GridStress)
	}

	// ── 9. Fault Mitigation Reward (Track 3) ──────────────────────────────
	if len(inp.ActiveFaults) > 0 {
		rc.FaultMitigation = computeFaultMitigationReward(inp.B, inp.ActiveFaults)
	}

	// ── Aggregate ────────────────────────────────────────────────────────────
	// Total is the sum of all 9 reward components. Each component is computed
	// independently above and contributes directly to the total signal.
	rc.Total = rc.CostSavings + rc.TempConstraint + rc.GridResponse +
		rc.DeadlinePenalty + rc.EfficiencyBonus + rc.StabilityPenalty + rc.CarbonReward +
		rc.InstructionReward + rc.FaultMitigation
	return rc
}
// computeInstructionReward scores the current step against the KPI targets
// listed on an instruction card.
//
// Every recognised key present in card.Targets ("max_cost", the pair
// "t_min"/"t_max", "min_shed_fraction", "carbon_reduction") contributes one
// sub-score. The sub-scores are averaged, multiplied by
// card.Weights["task_completion"] (0.5 when that weight is missing or zero)
// and clamped to [-0.5, 1.0]. A nil card, or a card carrying none of the
// recognised targets, yields 0. b is dereferenced as soon as any target is
// present, so it must be non-nil in that case.
func computeInstructionReward(card *InstructionCard, b *BuildingState, shedFraction, gridStress float64) float64 {
	if card == nil {
		return 0.0
	}

	scale := card.Weights["task_completion"]
	if scale == 0 {
		scale = 0.5
	}

	var (
		sum   float64
		count int
	)
	// record adds one KPI sub-score to the running average.
	record := func(v float64) {
		sum += v
		count++
	}

	// KPI: energy cost cap. Only evaluated for a strictly positive budget.
	if budget, ok := card.Targets["max_cost"]; ok && budget > 0 {
		switch {
		case b.CumulativeCost <= budget:
			record(1.0) // on track
		default:
			// Proportional penalty for the overshoot, floored at -1.
			overshoot := (b.CumulativeCost - budget) / budget
			record(math.Max(-1.0, -overshoot))
		}
	}

	// KPI: temperature bounds. Needs both ends of the band to be specified.
	lo, hasLo := card.Targets["t_min"]
	hi, hasHi := card.Targets["t_max"]
	if hasLo && hasHi {
		indoor := b.IndoorTemperature
		withinBand := indoor >= lo && indoor <= hi
		if withinBand {
			record(1.0)
		} else {
			// Penalty grows with the distance outside the band, floored at -1.
			outside := math.Max(indoor-hi, lo-indoor)
			record(math.Max(-1.0, -outside*0.3))
		}
	}

	// KPI: minimum load shed while the grid is under high stress.
	if required, ok := card.Targets["min_shed_fraction"]; ok {
		switch {
		case gridStress > 0.7 && shedFraction >= required:
			record(1.0)
		case gridStress > 0.7:
			record((shedFraction / required) - 1.0) // partial credit
		default:
			record(0.5) // no stress event this step — neutral
		}
	}

	// KPI: carbon reduction. The target value itself is not used; the grid
	// carbon intensity serves as a proxy, rewarding operation when it is low.
	if _, ok := card.Targets["carbon_reduction"]; ok {
		intensity := math.Max(0, (b.CarbonIntensity-100.0)/600.0)
		credit := 1.0 - intensity
		if intensity < 0.4 {
			credit = 1.0
		}
		record(credit)
	}

	if count == 0 {
		return 0.0
	}
	weighted := (sum / float64(count)) * scale
	return math.Max(-0.5, math.Min(1.0, weighted))
}
// computeTempReward returns a reward based on how close the indoor temperature
// is to the setpoint, with a proportional penalty outside [tMin, tMax].
//
// Inside the band the reward is a Gaussian of the distance to the setpoint,
// peaking at 1.5, with sigma equal to a quarter of the band width. Outside the
// band the result is -0.6 per degree of excess.
//
// A zero-width band (tMin == tMax) is handled explicitly: the Gaussian
// degenerates to a spike, so the full 1.5 is returned when temp equals the
// setpoint and 0 otherwise. Without that guard the 0/0 division would produce
// NaN and poison every sum the reward feeds into.
func computeTempReward(temp, setpoint, tMin, tMax float64) float64 {
	if temp >= tMin && temp <= tMax {
		// Gaussian-shaped bonus: maximum at setpoint, degrades toward bounds.
		deviation := math.Abs(temp - setpoint)
		sigma := (tMax - tMin) / 4.0
		if sigma <= 0 {
			// Degenerate band: avoid dividing by zero.
			if deviation == 0 {
				return 1.5
			}
			return 0.0
		}
		z := deviation / sigma
		return math.Exp(-0.5*z*z) * 1.5
	}
	// Outside bounds: proportional penalty.
	excess := math.Max(temp-tMax, tMin-temp)
	return -excess * 0.6
}
// computeGridResponse scores how well the load-shedding decision matches the
// current grid stress level.
//
// The cases are evaluated top to bottom, first match wins:
//   - high stress (> 0.7): shedding more than 10% earns a bonus proportional to
//     both the shed fraction and the stress; otherwise a stress-scaled penalty.
//   - moderate stress (> 0.3): shedding more than 5% earns a small proportional
//     bonus; otherwise a flat readiness bonus.
//   - low stress: shedding more than 10% is mildly penalised as unnecessary;
//     otherwise a small flat bonus for normal operation.
func computeGridResponse(stress, shedFraction float64) float64 {
	const (
		highStress     = 0.7
		moderateStress = 0.3
	)

	switch {
	case stress > highStress && shedFraction > 0.1:
		// Large bonus proportional to the shed fraction.
		return shedFraction * stress * 1.5
	case stress > highStress:
		// Grid is stressed but the agent is not shedding.
		return -0.2 * stress
	case stress > moderateStress && shedFraction > 0.05:
		// Proactive shedding during moderate stress.
		return shedFraction * 0.5
	case stress > moderateStress:
		// Grid-aware readiness bonus.
		return 0.08
	case shedFraction > 0.1:
		// Unnecessary shedding while the grid is relaxed.
		return -shedFraction * 0.3
	default:
		// Small positive signal for operating normally under low stress.
		return 0.1
	}
}
// computeArbitrageBonus rewards price-aware storage use relative to recent
// price history (causal: uses only past prices, no future curve leakage).
//
// The reference price is the mean of up to the last 8 entries of curve strictly
// before step. Charging (chargeRate > 0) while currentPrice is below that mean,
// or discharging (chargeRate < 0) while it is above, earns
// |chargeRate| * |mean - currentPrice| * 2; anything else earns 0.
//
// The look-back window is clamped to the valid index range of curve: early
// steps use whatever history exists instead of none, and a step that is
// negative or beyond the curve cannot index out of range. With no usable
// history (e.g. step 0) the result is 0.
func computeArbitrageBonus(chargeRate, currentPrice float64, curve []float64, step int) float64 {
	const lookBack = 8

	// Window is [start, end): the past prices that actually exist.
	end := step
	if end > len(curve) {
		end = len(curve)
	}
	start := step - lookBack
	if start < 0 {
		start = 0
	}
	if start >= end {
		return 0.0
	}

	pastSum := 0.0
	for _, price := range curve[start:end] {
		pastSum += price
	}
	pastAvg := pastSum / float64(end-start)

	if chargeRate > 0 && currentPrice < pastAvg {
		// Charging while energy is cheap.
		return chargeRate * (pastAvg - currentPrice) * 2.0
	}
	if chargeRate < 0 && currentPrice > pastAvg {
		// Discharging while energy is expensive.
		return math.Abs(chargeRate) * (currentPrice - pastAvg) * 2.0
	}
	return 0.0
}
// computeFaultMitigationReward scores the building's response to the faults
// that are currently active (Track 3, fault handling).
//
// Each active fault contributes independently:
//   - FaultGridOutage: +0.3 * LoadShedFraction when more than half the load is
//     shed, otherwise -0.2.
//   - FaultChillerFailure: +0.2 when the previous HVAC level is below 0.4,
//     otherwise -0.15.
//
// Other fault types contribute nothing. On top of that, building 0 takes a
// -0.5 penalty when its indoor temperature exceeds 28.0 while any fault is
// active. The sum is clamped to [-0.5, 0.3]. With no active faults the result
// is 0 and b is not touched.
func computeFaultMitigationReward(b *BuildingState, activeFaults []FaultEvent) float64 {
	if len(activeFaults) == 0 {
		return 0.0
	}

	var total float64
	for i := range activeFaults {
		switch activeFaults[i].Type {
		case FaultGridOutage:
			// Shedding load is the desired reaction to a grid outage.
			if shed := b.LoadShedFraction; shed > 0.5 {
				total += 0.3 * shed
			} else {
				total -= 0.2
			}
		case FaultChillerFailure:
			// HVAC should be throttled back while the chiller is faulted.
			if b.PrevHVACLevel < 0.4 {
				total += 0.2
			} else {
				total -= 0.15
			}
		}
	}

	// Critical penalty: building 0 overheating during any fault. Reaching this
	// point already implies at least one active fault.
	if b.BuildingID == 0 && b.IndoorTemperature > 28.0 {
		total -= 0.5
	}

	capped := math.Min(0.3, total)
	return math.Max(-0.5, capped)
}