// Package env implements the multi-component dense reward function for GridMind-RL.
package env

import "math"

// ComputeRewardInput bundles all inputs needed to compute the reward for one step.
//
// It is passed by value to ComputeReward. B must be non-nil: ComputeReward reads
// the indoor/setpoint temperature, job list, thermal storage level and carbon
// intensity from it without a nil check.
type ComputeRewardInput struct {
	B               *BuildingState   // building state for this step; must be non-nil
	Act             ActionModel      // action applied this step (its HVACPowerLevel feeds the stability term)
	StepCost        float64          // $ cost incurred this step
	EnergyKWh       float64          // kWh consumed this step
	TMin            float64          // lower temperature bound (°C)
	TMax            float64          // upper temperature bound (°C)
	StepCarbon      float64          // gCO2 emitted this step
	BatchMissed     int              // number of batch jobs that missed deadline this step
	GridStress      float64          // 0.0–1.0 grid stress signal
	ShedFraction    float64          // clamped load shed fraction
	TaskID          int              // 1, 2, 3, or 4
	PrevHVACLevel   float64          // previous step's HVAC power level (for stability)
	ChargeRate      float64          // current thermal charge rate
	PrevChargeRate  float64          // previous step's thermal charge rate
	StorageDelta    float64          // change in storage level (+ = charging)
	PriceCurve      []float64        // full episode price curve for arbitrage calc
	CurrentStep     int              // current step index
	InstructionCard *InstructionCard // non-nil for Task 4 episodes
	ActiveFaults    []FaultEvent     // currently active fault events for Track 3
}

// ComputeReward returns a dense RewardComponents struct from the current step inputs.
//
// Seven core components (cost savings, temperature, grid response, deadlines,
// storage efficiency, stability, carbon) are computed on every call. Two further
// components are filled in only when they apply: the instruction-following
// reward (TaskID 4 with a non-nil InstructionCard) and the fault-mitigation
// reward (at least one active fault). Total is the plain sum of all nine.
// Task-specific weighting is handled by the GRADING system (tasks.go), not here.
//
// inp.B must be non-nil; it is dereferenced unconditionally.
func ComputeReward(inp ComputeRewardInput) RewardComponents {
	rc := RewardComponents{}

	// 1. Cost savings: positive baseline minus cost relative to a typical step,
	// so cheaper-than-typical steps score above the break-even point.
	const typicalCost = 4.0
	rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0

	// 2. Temperature constraint: bonus near the setpoint, penalty outside bounds.
	temp := inp.B.IndoorTemperature
	rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)

	// 3. Grid stress response: demand-response compliance.
	rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)

	// 4. Deadline penalty / bonus: penalise jobs missed this step.
	if inp.BatchMissed > 0 {
		rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
	}
	// A job counts as on track when it has not missed its deadline, whether it
	// is still pending or already completed.
	onTrackJobs := 0
	for _, job := range inp.B.Jobs {
		if !job.MissedDeadline {
			onTrackJobs++
		}
	}
	// The on-track bonus is only granted on steps without a fresh miss.
	if onTrackJobs > 0 && inp.BatchMissed == 0 {
		rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
	}

	// 5. Efficiency bonus (thermal storage utilisation): arbitrage term first.
	// The step index is range-checked on both sides so an out-of-range value
	// (including a negative one) skips the term instead of panicking.
	if inp.CurrentStep >= 0 && inp.CurrentStep < len(inp.PriceCurve) {
		rc.EfficiencyBonus = computeArbitrageBonus(
			inp.ChargeRate,
			inp.PriceCurve[inp.CurrentStep],
			inp.PriceCurve,
			inp.CurrentStep,
		)
	}
	// Baseline: reward a balanced storage level, penalise near-empty/near-full.
	// Levels in (0.05, 0.2] and [0.85, 0.98) are neutral.
	storageLevel := inp.B.ThermalStorageLevel
	if storageLevel > 0.2 && storageLevel < 0.85 {
		rc.EfficiencyBonus += 0.15 // good operating range
	} else if storageLevel <= 0.05 || storageLevel >= 0.98 {
		rc.EfficiencyBonus -= 0.1 // extremes are wasteful
	}

	// 6. Stability: weighted step-to-step change in HVAC level and charge rate.
	// Below the 0.3 threshold the term is a bonus, above it a penalty.
	hvacDelta := math.Abs(inp.Act.HVACPowerLevel - inp.PrevHVACLevel)
	chargeDelta := math.Abs(inp.ChargeRate - inp.PrevChargeRate)
	oscillation := hvacDelta*0.5 + chargeDelta*0.3
	if oscillation > 0.3 {
		rc.StabilityPenalty = -(oscillation - 0.3) * 0.8
	} else {
		rc.StabilityPenalty = (0.3 - oscillation) * 0.4
	}

	// 7. Carbon: baseline bonus reduced by energy use scaled with the
	// normalised carbon intensity (floored at 0).
	carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
	rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
	// Extra bonus while the normalised intensity is low.
	if carbonNorm < 0.3 {
		rc.CarbonReward += 0.15
	}

	// 8. Instruction-following reward (Task 4 only).
	if inp.TaskID == 4 && inp.InstructionCard != nil {
		rc.InstructionReward = computeInstructionReward(inp.InstructionCard, inp.B, inp.ShedFraction, inp.GridStress)
	}

	// 9. Fault mitigation reward (Track 3).
	if len(inp.ActiveFaults) > 0 {
		rc.FaultMitigation = computeFaultMitigationReward(inp.B, inp.ActiveFaults)
	}

	// Aggregate: unweighted sum of all nine components.
	rc.Total = rc.CostSavings + rc.TempConstraint + rc.GridResponse +
		rc.DeadlinePenalty + rc.EfficiencyBonus + rc.StabilityPenalty + rc.CarbonReward +
		rc.InstructionReward + rc.FaultMitigation

	return rc
}

// computeInstructionReward scores the current step against the KPI targets
// listed on an instruction card.
//
// Each target present in card.Targets contributes one KPI score (at most 1.0);
// the scores are averaged, multiplied by the "task_completion" weight (0.5 when
// that weight is absent or zero) and clamped to [-0.5, 1.0]. A nil card, or a
// card with none of the recognised targets, yields 0.
func computeInstructionReward(card *InstructionCard, b *BuildingState, shedFraction, gridStress float64) float64 {
	if card == nil {
		return 0.0
	}

	scale := card.Weights["task_completion"]
	if scale == 0 {
		scale = 0.5
	}

	var (
		kpiCount int
		kpiSum   float64
	)

	// KPI: cumulative energy cost stays under the cap.
	if budget, found := card.Targets["max_cost"]; found && budget > 0 {
		kpiCount++
		spent := b.CumulativeCost
		if spent <= budget {
			kpiSum += 1.0 // on track
		} else {
			// Penalty grows with the relative overspend, floored at -1.
			kpiSum += math.Max(-1.0, -((spent - budget) / budget))
		}
	}

	// KPI: indoor temperature inside the card's band (needs both bounds).
	lo, hasLo := card.Targets["t_min"]
	hi, hasHi := card.Targets["t_max"]
	if hasLo && hasHi {
		kpiCount++
		indoor := b.IndoorTemperature
		if indoor >= lo && indoor <= hi {
			kpiSum += 1.0
		} else {
			overshoot := math.Max(indoor-hi, lo-indoor)
			kpiSum += math.Max(-1.0, -overshoot*0.3)
		}
	}

	// KPI: minimum load shed while the grid is under high stress.
	if required, found := card.Targets["min_shed_fraction"]; found {
		kpiCount++
		credit := 0.5 // no stress event this step — neutral
		if gridStress > 0.7 {
			if shedFraction >= required {
				credit = 1.0
			} else {
				credit = (shedFraction / required) - 1.0 // partial credit
			}
		}
		kpiSum += credit
	}

	// KPI: carbon reduction, approximated by the normalised carbon intensity;
	// the target's value itself is not used, only its presence.
	if _, found := card.Targets["carbon_reduction"]; found {
		kpiCount++
		intensity := math.Max(0, (b.CarbonIntensity-100.0)/600.0)
		if intensity < 0.4 {
			kpiSum += 1.0
		} else {
			kpiSum += 1.0 - intensity
		}
	}

	if kpiCount == 0 {
		return 0.0
	}
	weighted := (kpiSum / float64(kpiCount)) * scale
	return math.Max(-0.5, math.Min(1.0, weighted))
}

// computeTempReward returns a reward based on how close the indoor temperature
// is to the setpoint, with a linear penalty outside [tMin, tMax].
//
// Inside the band the reward is a Gaussian in |temp - setpoint| with
// sigma = (tMax - tMin) / 4 and a peak of 1.5 at the setpoint. Outside the band
// the result is -0.6 per degree of excess. A zero-width band (tMin == tMax) is
// handled explicitly so the result is never NaN: 1.5 when temp sits exactly on
// the setpoint, 0 otherwise.
func computeTempReward(temp, setpoint, tMin, tMax float64) float64 {
	if temp >= tMin && temp <= tMax {
		deviation := math.Abs(temp - setpoint)
		sigma := (tMax - tMin) / 4.0
		if sigma <= 0 {
			// Degenerate band: deviation/sigma would be 0/0 (NaN) at the
			// setpoint. Use the limiting values of the Gaussian instead.
			if deviation == 0 {
				return 1.5
			}
			return 0.0
		}
		z := deviation / sigma
		return math.Exp(-0.5*z*z) * 1.5
	}
	// Outside bounds: penalty proportional to the distance past the nearer bound.
	excess := math.Max(temp-tMax, tMin-temp)
	return -excess * 0.6
}

// computeGridResponse scores load shedding against the grid stress signal.
//
// Three stress bands are distinguished:
//   - stress > 0.7: shedding above 0.1 earns shedFraction*stress*1.5,
//     anything less is penalised with -0.2*stress;
//   - stress > 0.3: shedding above 0.05 earns shedFraction*0.5,
//     otherwise a flat 0.08 readiness bonus;
//   - otherwise: shedding above 0.1 costs shedFraction*0.3,
//     normal operation earns a flat 0.1.
func computeGridResponse(stress, shedFraction float64) float64 {
	switch {
	case stress > 0.7:
		// High stress: shedding is what the grid needs right now.
		if shedFraction > 0.1 {
			return shedFraction * stress * 1.5
		}
		return -0.2 * stress

	case stress > 0.3:
		// Moderate stress: proactive shedding is rewarded, idling is fine.
		if shedFraction > 0.05 {
			return shedFraction * 0.5
		}
		return 0.08

	default:
		// Low stress: shedding is unnecessary and mildly penalised.
		if shedFraction > 0.1 {
			return -shedFraction * 0.3
		}
		return 0.1
	}
}

// computeArbitrageBonus rewards storage use when the current price compares
// favourably with recent history (causal: only prices before step are read,
// so there is no future-curve leakage).
//
// The reference price is the mean of up to the 8 entries of curve immediately
// preceding step. The window is clamped to the available data, so early steps
// (step < 8) average over whatever history exists instead of getting no bonus,
// and a step beyond len(curve) cannot index out of range. With no history at
// all (step <= 0 or an empty window) the bonus is 0.
//
// Charging (chargeRate > 0) below the reference price, or discharging
// (chargeRate < 0) above it, earns |chargeRate| * priceGap * 2; anything else
// earns 0.
func computeArbitrageBonus(chargeRate, currentPrice float64, curve []float64, step int) float64 {
	const lookBack = 8

	// Half-open window [start, end) of past prices, clamped to valid indices.
	end := step
	if end > len(curve) {
		end = len(curve)
	}
	start := step - lookBack
	if start < 0 {
		start = 0
	}
	if start >= end {
		return 0.0
	}

	pastSum := 0.0
	for _, price := range curve[start:end] {
		pastSum += price
	}
	pastAvg := pastSum / float64(end-start)

	if chargeRate > 0 && currentPrice < pastAvg {
		return chargeRate * (pastAvg - currentPrice) * 2.0
	}
	if chargeRate < 0 && currentPrice > pastAvg {
		return math.Abs(chargeRate) * (currentPrice - pastAvg) * 2.0
	}
	return 0.0
}

// computeFaultMitigationReward scores the building's response to the currently
// active faults (Track 3, fault handling).
//
// Every active fault contributes independently:
//   - FaultGridOutage: +0.3*LoadShedFraction when more than half the load is
//     shed, otherwise -0.2;
//   - FaultChillerFailure: +0.2 when b.PrevHVACLevel is below 0.4,
//     otherwise -0.15.
//
// Other fault types contribute nothing. An extra -0.5 applies when building 0
// is above 28.0 indoors while any fault is active. The sum is clamped to
// [-0.5, 0.3]; with no active faults the result is 0.
func computeFaultMitigationReward(b *BuildingState, activeFaults []FaultEvent) float64 {
	if len(activeFaults) == 0 {
		return 0.0
	}

	total := 0.0
	for i := range activeFaults {
		switch activeFaults[i].Type {
		case FaultGridOutage:
			// Shedding load is the desired reaction to a grid outage.
			if shed := b.LoadShedFraction; shed > 0.5 {
				total += 0.3 * shed
			} else {
				total -= 0.2
			}
		case FaultChillerFailure:
			// Backing off HVAC is the desired reaction to a chiller fault.
			// NOTE(review): this reads the building's PrevHVACLevel, not the
			// current action — confirm that is the intended signal.
			if b.PrevHVACLevel < 0.4 {
				total += 0.2
			} else {
				total -= 0.15
			}
		}
	}

	// Critical penalty: building 0 overheating. At least one fault is active
	// here because the empty case returned above.
	if b.BuildingID == 0 && b.IndoorTemperature > 28.0 {
		total -= 0.5
	}

	return math.Max(-0.5, math.Min(0.3, total))
}