Spaces:

Prajwal782007
/

Gridmind

Running

App Files Files Community

Gridmind / env /rewards.go

adityss

feat: implement multi-component dense reward function and environmental logic for GridMind-RL

b81683f about 2 months ago

raw

history blame contribute delete

11.7 kB

	// Package env implements the multi-component dense reward function for GridMind-RL.
	package env

	import "math"

	// ComputeRewardInput bundles all inputs needed to compute the reward for one step.
	type ComputeRewardInput struct {
	B *BuildingState
	Act ActionModel
	StepCost float64 // $ cost incurred this step
	EnergyKWh float64 // kWh consumed this step
	TMin float64 // lower temperature bound (°C)
	TMax float64 // upper temperature bound (°C)
	StepCarbon float64 // gCO2 emitted this step
	BatchMissed int // number of batch jobs that missed deadline this step
	GridStress float64 // 0.0–1.0 grid stress signal
	ShedFraction float64 // clamped load shed fraction
	TaskID int // 1, 2, 3, or 4
	PrevHVACLevel float64 // previous step's HVAC power level (for stability)
	ChargeRate float64 // current thermal charge rate
	PrevChargeRate float64 // previous step's thermal charge rate
	StorageDelta float64 // change in storage level (+ = charging)
	PriceCurve []float64 // full episode price curve for arbitrage calc
	CurrentStep int // current step index
	InstructionCard *InstructionCard // non-nil for Task 4 episodes
	ActiveFaults []FaultEvent // currently active fault events for Track 3
	}

	// ComputeReward returns a dense RewardComponents struct from the current step inputs.
	// All 7 reward components are always computed for rich per-step signal.
	// Task-specific weighting is handled by the GRADING system (tasks.go), not here.
	func ComputeReward(inp ComputeRewardInput) RewardComponents {
	rc := RewardComponents{}

	// ── 1. Cost Savings ─────────────────────────────────────────────────────
	// Positive baseline minus relative cost: smart agents save money.
	typicalCost := 4.0
	rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0

	// ── 2. Temperature Constraint ────────────────────────────────────────────
	// Gaussian bonus for being near setpoint; penalty outside comfort bounds.
	temp := inp.B.IndoorTemperature
	rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)

	// ── 3. Grid Stress Response ──────────────────────────────────────────────
	// Rewards proactive grid awareness and demand-response compliance.
	rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)

	// ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
	// Penalise missed batch jobs, reward on-track pending jobs.
	if inp.BatchMissed > 0 {
	rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
	}
	// Positive signal: reward for jobs still on track (not missed yet)
	onTrackJobs := 0
	for _, job := range inp.B.Jobs {
	if !job.Completed && !job.MissedDeadline {
	onTrackJobs++
	}
	if job.Completed && !job.MissedDeadline {
	onTrackJobs++ // completed on time is even better
	}
	}
	if onTrackJobs > 0 && inp.BatchMissed == 0 {
	rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
	}

	// ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
	// Rewards smart storage use: arbitrage + maintaining useful storage levels.
	if len(inp.PriceCurve) > inp.CurrentStep {
	rc.EfficiencyBonus = computeArbitrageBonus(
	inp.ChargeRate,
	inp.PriceCurve[inp.CurrentStep],
	inp.PriceCurve,
	inp.CurrentStep,
	)
	}
	// Baseline: reward maintaining a balanced storage level (not empty, not always full)
	storageLevel := inp.B.ThermalStorageLevel
	if storageLevel > 0.2 && storageLevel < 0.85 {
	rc.EfficiencyBonus += 0.15 // good operating range
	} else if storageLevel <= 0.05 \|\| storageLevel >= 0.98 {
	rc.EfficiencyBonus -= 0.1 // extremes are wasteful
	}

	// ── 6. Stability Reward/Penalty ──────────────────────────────────────────
	// Smooth operation earns a bonus; rapid oscillation earns a penalty.
	hvacDelta := math.Abs(inp.Act.HVACPowerLevel - inp.PrevHVACLevel)
	chargeDelta := math.Abs(inp.ChargeRate - inp.PrevChargeRate)
	oscillation := hvacDelta0.5 + chargeDelta0.3
	if oscillation > 0.3 {
	rc.StabilityPenalty = -(oscillation - 0.3) * 0.8
	} else {
	// Positive reward for smooth, stable control
	rc.StabilityPenalty = (0.3 - oscillation) * 0.4
	}

	// ── 7. Carbon Reward ─────────────────────────────────────────────────────
	// Rewards low-carbon operation based on grid carbon intensity.
	carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
	// Baseline bonus, reduced by carbon-heavy consumption
	rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
	// Extra bonus for operating during genuinely clean grid periods
	if carbonNorm < 0.3 {
	rc.CarbonReward += 0.15
	}

	// ── 8. Instruction-Following Reward (Task 4 only) ─────────────────────────
	if inp.TaskID == 4 && inp.InstructionCard != nil {
	rc.InstructionReward = computeInstructionReward(inp.InstructionCard, inp.B, inp.ShedFraction, inp.GridStress)
	}

	// ── 9. Fault Mitigation Reward (Track 3) ──────────────────────────────
	if len(inp.ActiveFaults) > 0 {
	rc.FaultMitigation = computeFaultMitigationReward(inp.B, inp.ActiveFaults)
	}

	// ── Aggregate ────────────────────────────────────────────────────────────
	// Total is the sum of all 9 reward components. Each component is computed
	// independently above and contributes directly to the total signal.
	rc.Total = rc.CostSavings + rc.TempConstraint + rc.GridResponse +
	rc.DeadlinePenalty + rc.EfficiencyBonus + rc.StabilityPenalty + rc.CarbonReward +
	rc.InstructionReward + rc.FaultMitigation

	return rc
	}

	// computeInstructionReward scores per-step progress against the instruction card targets.
	// Returns a value in roughly [-0.5, 1.0] depending on how well the agent tracks targets.
	func computeInstructionReward(card InstructionCard, b BuildingState, shedFraction, gridStress float64) float64 {
	if card == nil {
	return 0.0
	}

	score := 0.0
	weight := card.Weights["task_completion"]
	if weight == 0 {
	weight = 0.5
	}

	components := 0
	total := 0.0

	// KPI: energy cost cap
	if maxCost, ok := card.Targets["max_cost"]; ok && maxCost > 0 {
	components++
	if b.CumulativeCost <= maxCost {
	total += 1.0 // on track
	} else {
	// Proportional penalty for how far over budget we are
	overRatio := (b.CumulativeCost - maxCost) / maxCost
	total += math.Max(-1.0, -overRatio)
	}
	}

	// KPI: temperature bounds
	if tMin, okMin := card.Targets["t_min"]; okMin {
	if tMax, okMax := card.Targets["t_max"]; okMax {
	components++
	temp := b.IndoorTemperature
	if temp >= tMin && temp <= tMax {
	total += 1.0
	} else {
	excess := math.Max(temp-tMax, tMin-temp)
	total += math.Max(-1.0, -excess*0.3)
	}
	}
	}

	// KPI: minimum load shed during grid stress
	if minShed, ok := card.Targets["min_shed_fraction"]; ok {
	components++
	if gridStress > 0.7 {
	if shedFraction >= minShed {
	total += 1.0
	} else {
	total += (shedFraction / minShed) - 1.0 // partial credit
	}
	} else {
	total += 0.5 // no stress event this step — neutral
	}
	}

	// KPI: carbon reduction (vs baseline, approximated by carbon intensity signal)
	if _, ok := card.Targets["carbon_reduction"]; ok {
	components++
	// Proxy: reward operating when carbon intensity is low
	carbonNorm := math.Max(0, (b.CarbonIntensity-100.0)/600.0)
	if carbonNorm < 0.4 {
	total += 1.0
	} else {
	total += 1.0 - carbonNorm
	}
	}

	if components == 0 {
	return 0.0
	}
	score = (total / float64(components)) * weight
	return math.Max(-0.5, math.Min(1.0, score))
	}

	// computeTempReward returns a reward based on how close the indoor temperature
	// is to the setpoint, with a hard penalty outside [TMin, TMax].
	func computeTempReward(temp, setpoint, tMin, tMax float64) float64 {
	if temp >= tMin && temp <= tMax {
	// Gaussian-shaped bonus: maximum at setpoint, degrades toward bounds
	deviation := math.Abs(temp - setpoint)
	sigma := (tMax - tMin) / 4.0
	return math.Exp(-0.5(deviation/sigma)(deviation/sigma)) * 1.5 // Increased positive reward
	}
	// Outside bounds: proportional penalty
	excess := math.Max(temp-tMax, tMin-temp)
	return -excess * 0.6
	}

	// computeGridResponse returns a reward for grid-aware behavior:
	// bonus for shedding during stress, baseline for readiness, penalty for waste.
	func computeGridResponse(stress, shedFraction float64) float64 {
	if stress > 0.7 {
	// High stress: large bonus proportional to shed fraction
	if shedFraction > 0.1 {
	return shedFraction * stress * 1.5
	}
	// High stress but not shedding: penalty
	return -0.2 * stress
	}
	if stress > 0.3 {
	// Moderate stress: small bonus for readiness, small bonus for proactive shedding
	if shedFraction > 0.05 {
	return shedFraction * 0.5 // proactive shedding during moderate stress
	}
	return 0.08 // grid-aware readiness bonus
	}
	// Low stress: mild penalty for unnecessary shedding, baseline for normal operation
	if shedFraction > 0.1 {
	return -shedFraction * 0.3
	}
	return 0.1 // small positive signal for operating normally under low stress
	}

	// computeArbitrageBonus rewards storage use when current price is low vs recent history
	// (causal: uses only past prices, no future curve leakage).
	func computeArbitrageBonus(chargeRate, currentPrice float64, curve []float64, step int) float64 {
	lookBack := 8
	pastSum := 0.0
	count := 0
	for i := step - lookBack; i < step && i >= 0; i++ {
	pastSum += curve[i]
	count++
	}
	if count == 0 {
	return 0.0
	}
	pastAvg := pastSum / float64(count)

	if chargeRate > 0 && currentPrice < pastAvg {
	return chargeRate * (pastAvg - currentPrice) * 2.0
	}
	if chargeRate < 0 && currentPrice > pastAvg {
	return math.Abs(chargeRate) * (currentPrice - pastAvg) * 2.0
	}
	return 0.0
	}

	// computeFaultMitigationReward returns reward/penalty for proper fault response behavior.
	// Tracks Track 3 (fault handling) in the hackathon theme.
	func computeFaultMitigationReward(b *BuildingState, activeFaults []FaultEvent) float64 {
	if len(activeFaults) == 0 {
	return 0.0
	}

	score := 0.0
	for _, fault := range activeFaults {
	switch fault.Type {
	case FaultGridOutage:
	// Reward for shedding load during grid outage
	// High load_shed_fraction = good. Low = bad.
	if b.LoadShedFraction > 0.5 {
	score += 0.3 * b.LoadShedFraction
	} else {
	score -= 0.2
	}
	case FaultChillerFailure:
	// Reward for reducing HVAC during chiller fault
	hvacLevel := b.PrevHVACLevel
	if hvacLevel < 0.4 {
	score += 0.2
	} else {
	score -= 0.15
	}
	}
	}

	// Critical penalty: building 0 overheating during any fault
	if b.BuildingID == 0 && b.IndoorTemperature > 28.0 && len(activeFaults) > 0 {
	score -= 0.5
	}

	return math.Max(-0.5, math.Min(0.3, score))
	}