From caf55994ebccd70911b7bb05f25d03685ffcb51b Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Thu, 13 Feb 2025 16:42:43 +0100
Subject: [PATCH] Changed the init_final value to center the starting mean and
 std of the policy

Co-authored-by: Adil Zouitine
---
 .../common/policies/hilserl/classifier/modeling_classifier.py | 2 +-
 lerobot/common/policies/sac/configuration_sac.py              | 1 +
 lerobot/scripts/server/actor_server.py                        | 2 +-
 lerobot/scripts/server/gym_manipulator.py                     | 4 ++--
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
index a9fbb601..e6700547 100644
--- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
@@ -148,7 +148,7 @@ class Classifier(
     def predict_reward(self, x, threshold=0.6):
         if self.config.num_classes == 2:
             probs = self.forward(x).probabilities
-            logging.info(f"Predicted reward images: {probs}")
+            logging.debug(f"Predicted reward images: {probs}")
             return (probs > threshold).float()
         else:
             return torch.argmax(self.forward(x).probabilities, dim=1)
diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py
index e9d78fdd..18ceee24 100644
--- a/lerobot/common/policies/sac/configuration_sac.py
+++ b/lerobot/common/policies/sac/configuration_sac.py
@@ -95,5 +95,6 @@ class SACConfig:
             "use_tanh_squash": True,
             "log_std_min": -5,
             "log_std_max": 2,
+            "init_final": 0.01,
         }
     )
diff --git a/lerobot/scripts/server/actor_server.py b/lerobot/scripts/server/actor_server.py
index 2be6674c..6cdc49e3 100644
--- a/lerobot/scripts/server/actor_server.py
+++ b/lerobot/scripts/server/actor_server.py
@@ -327,7 +327,7 @@ def send_transitions_in_chunks(transitions: list, message_queue, chunk_size: int
 def get_frequency_stats(list_policy_time: list[float]) -> dict[str, float]:
     stats = {}
     list_policy_fps = [1.0 / t for t in list_policy_time]
-    if len(list_policy_fps) > 0:
+    if len(list_policy_fps) > 1:
         policy_fps = mean(list_policy_fps)
         quantiles_90 = quantiles(list_policy_fps, n=10)[-1]
         logging.debug(f"[ACTOR] Average policy frame rate: {policy_fps}")
diff --git a/lerobot/scripts/server/gym_manipulator.py b/lerobot/scripts/server/gym_manipulator.py
index 3ed1fdc9..a43f07ca 100644
--- a/lerobot/scripts/server/gym_manipulator.py
+++ b/lerobot/scripts/server/gym_manipulator.py
@@ -217,7 +217,7 @@ class HILSerlRobotEnv(gym.Env):
         if torch.any(teleop_action < -self.delta_relative_bounds_size * self.delta) and torch.any(
             teleop_action > self.delta_relative_bounds_size
         ):
-            print(
+            logging.debug(
                 f"Relative teleop delta exceeded bounds {self.delta_relative_bounds_size}, teleop_action {teleop_action}\n"
                 f"lower bounds condition {teleop_action < -self.delta_relative_bounds_size}\n"
                 f"upper bounds condition {teleop_action > self.delta_relative_bounds_size}"
             )
@@ -318,7 +318,7 @@ class RewardWrapper(gym.Wrapper):
             )
         info["Reward classifer frequency"] = 1 / (time.perf_counter() - start_time)

-        logging.info(f"Reward: {reward}")
+        # logging.info(f"Reward: {reward}")

         if reward == 1.0:
             terminated = True
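
Notes on the patch: the actor_server change from `> 0` to `> 1` matches `statistics.quantiles`, which raises `StatisticsError` with fewer than two data points, and the remaining hunks only demote or silence log noise. The substantive change is the new `"init_final": 0.01` entry in `policy_kwargs`. A common pattern, and presumably what `init_final` controls here, is a small uniform initialization of the policy head's output layers so the pre-squash mean and the raw log-std start near zero, which centers the initial action distribution. The sketch below illustrates that pattern only; the class and attribute names are hypothetical and not taken from lerobot's `modeling_sac.py`.

```python
import torch
from torch import nn


class TanhGaussianHead(nn.Module):
    """Illustrative tanh-squashed Gaussian policy head (not lerobot's actual class).

    `init_final` bounds the uniform initialization of the output layers so the
    pre-squash mean and the raw log-std both start near zero, centering the
    initial action distribution.
    """

    def __init__(
        self,
        hidden_dim: int,
        action_dim: int,
        init_final: float | None = 0.01,
        log_std_min: float = -5.0,
        log_std_max: float = 2.0,
    ):
        super().__init__()
        self.log_std_min, self.log_std_max = log_std_min, log_std_max
        self.mean_layer = nn.Linear(hidden_dim, action_dim)
        self.log_std_layer = nn.Linear(hidden_dim, action_dim)
        if init_final is not None:
            # Draw weights and biases uniformly from [-init_final, init_final].
            for layer in (self.mean_layer, self.log_std_layer):
                nn.init.uniform_(layer.weight, -init_final, init_final)
                nn.init.uniform_(layer.bias, -init_final, init_final)

    def forward(self, h: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # With a small init_final, mean ~ 0 at initialization, so tanh-squashed
        # samples drawn downstream are centered around zero.
        mean = self.mean_layer(h)
        # Map the raw log-std into [log_std_min, log_std_max]; a near-zero raw
        # value lands close to the middle of the range.
        log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (
            torch.tanh(self.log_std_layer(h)) + 1.0
        )
        return mean, log_std
```

Under these assumptions, `init_final = 0.01` means the output layers contribute almost nothing at the start, so exploration begins from a roughly symmetric distribution around the neutral action rather than whatever the default random initialization happens to produce.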