rishiad
/

default_submission

Model card Files and versions

xet

Community

rishiad committed on Aug 30, 2025

Commit

d36955c

unverified ·

1 Parent(s): 7fdb0a1

enhance: add policy scaling and improved debug output in RLAgent

Browse files

Files changed (1) hide show

agent.py +38 -28

agent.py CHANGED Viewed

@@ -37,10 +37,21 @@ class RLAgent(AgentInterface):
         self.policy = SawyerReachV3Policy()
         print("Successfully initialized SawyerReachV3Policy")
         # Track episode state
         self.episode_step = 0
         self.max_episode_steps = kwargs.get("max_episode_steps", 200)
         # Debug flags
         self.debug_observations = True
         self.debug_actions = True
@@ -59,32 +70,29 @@ class RLAgent(AgentInterface):
             action: Action tensor to take in the environment
         """
         try:
-            # Debug observation structure
-            if self.debug_observations and self.episode_step % 20 == 0:
-                print(f"Raw observation structure: {type(obs)}")
-                if isinstance(obs, dict):
-                    print(f"Observation keys: {list(obs.keys())}")
-                    for key, value in obs.items():
-                        if isinstance(value, np.ndarray):
-                            print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
-                        else:
-                            print(f"  {key}: {type(value)} = {value}")
             # Process observation to extract the format needed by the expert policy
             processed_obs = self._process_observation(obs)
-            # Debug processed observation
-            if self.debug_observations and self.episode_step % 20 == 0:
-                print(f"Processed obs: shape={processed_obs.shape}, dtype={processed_obs.dtype}")
-                print(f"Processed obs sample: {processed_obs[:10]}...")  # First 10 values
             # Use the expert policy
             action_numpy = self.policy.get_action(processed_obs)
-            # Debug raw policy output
-            if self.debug_actions and self.episode_step % 20 == 0:
-                print(f"Raw policy action: {action_numpy}, type: {type(action_numpy)}")
-                print(f"Action shape: {np.array(action_numpy).shape}")
             # Convert to tensor
             if isinstance(action_numpy, (list, tuple)):
@@ -92,6 +100,12 @@ class RLAgent(AgentInterface):
             else:
                 action_tensor = torch.from_numpy(np.array(action_numpy)).float()
             # Ensure correct action dimensionality
             if self.action_space and hasattr(self.action_space, 'shape'):
                 expected_shape = self.action_space.shape[0]
@@ -104,9 +118,8 @@ class RLAgent(AgentInterface):
                     else:
                         action_tensor = action_tensor[:expected_shape]
-            # Debug final action
-            if self.debug_actions and self.episode_step % 20 == 0:
-                print(f"Final action tensor: {action_tensor}")
             self.episode_step += 1
             return action_tensor
@@ -139,8 +152,7 @@ class RLAgent(AgentInterface):
             for key in possible_keys:
                 if key in obs:
                     processed_obs = obs[key]
-                    if self.debug_observations and self.episode_step % 50 == 0:
-                        print(f"Using observation key: {key}")
                     break
             if processed_obs is None:
@@ -150,13 +162,11 @@ class RLAgent(AgentInterface):
                     if isinstance(value, (np.ndarray, list, tuple)):
                         flat_value = np.array(value).flatten()
                         numeric_values.append(flat_value)
-                        if self.debug_observations and self.episode_step % 50 == 0:
-                            print(f"Concatenating key {key}: shape={flat_value.shape}")
                 if numeric_values:
                     processed_obs = np.concatenate(numeric_values)
-                    if self.debug_observations and self.episode_step % 50 == 0:
-                        print(f"Concatenated observation shape: {processed_obs.shape}")
                 else:
                     # Last resort: use first value
                     processed_obs = next(iter(obs.values()))

         self.policy = SawyerReachV3Policy()
         print("Successfully initialized SawyerReachV3Policy")
+        # Check if policy has any scaling attributes that might need adjustment
+        if hasattr(self.policy, 'action_space'):
+            print(f"Policy action space: {self.policy.action_space}")
+        if hasattr(self.policy, 'scale'):
+            print(f"Policy scale: {self.policy.scale}")
+        if hasattr(self.policy, 'bias'):
+            print(f"Policy bias: {self.policy.bias}")
         # Track episode state
         self.episode_step = 0
         self.max_episode_steps = kwargs.get("max_episode_steps", 200)
+        # Policy scaling factor (can be adjusted if policy constants are too high)
+        self.policy_scale = kwargs.get("policy_scale", 1.0)
         # Debug flags
         self.debug_observations = True
         self.debug_actions = True
             action: Action tensor to take in the environment
         """
         try:
+            # Debug observation structure (now printed on every call)
+            print(f"Raw observation structure: {type(obs)}")
+            if isinstance(obs, dict):
+                print(f"Observation keys: {list(obs.keys())}")
+                for key, value in obs.items():
+                    if isinstance(value, np.ndarray):
+                        print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
+                    else:
+                        print(f"  {key}: {type(value)} = {value}")
             # Process observation to extract the format needed by the expert policy
             processed_obs = self._process_observation(obs)
+            # Debug processed observation (now printed on every call)
+            print(f"Processed obs: shape={processed_obs.shape}, dtype={processed_obs.dtype}")
+            print(f"Processed obs sample: {processed_obs[:10]}...")  # First 10 values
             # Use the expert policy
             action_numpy = self.policy.get_action(processed_obs)
+            # Debug raw policy output (now printed on every call)
+            print(f"Raw policy action: {action_numpy}, type: {type(action_numpy)}")
+            print(f"Action shape: {np.array(action_numpy).shape}")
             # Convert to tensor
             if isinstance(action_numpy, (list, tuple)):
             else:
                 action_tensor = torch.from_numpy(np.array(action_numpy)).float()
+            # Apply scaling factor if needed (helps with policy constants that may be too high)
+            action_tensor = action_tensor * self.policy_scale
+            # Clip actions to [-1, 1] range to handle policy constants that may be too high
+            action_tensor = torch.clamp(action_tensor, -1.0, 1.0)
             # Ensure correct action dimensionality
             if self.action_space and hasattr(self.action_space, 'shape'):
                 expected_shape = self.action_space.shape[0]
                     else:
                         action_tensor = action_tensor[:expected_shape]
+            # Debug final action (now printed on every call)
+            print(f"Final action tensor: {action_tensor}")
             self.episode_step += 1
             return action_tensor
             for key in possible_keys:
                 if key in obs:
                     processed_obs = obs[key]
+                    print(f"Using observation key: {key}")
                     break
             if processed_obs is None:
                     if isinstance(value, (np.ndarray, list, tuple)):
                         flat_value = np.array(value).flatten()
                         numeric_values.append(flat_value)
+                        print(f"Concatenating key {key}: shape={flat_value.shape}")
                 if numeric_values:
                     processed_obs = np.concatenate(numeric_values)
+                    print(f"Concatenated observation shape: {processed_obs.shape}")
                 else:
                     # Last resort: use first value
                     processed_obs = next(iter(obs.values()))