From 34e639978169c63f34f7cb6e95ab10f3700cb1fb Mon Sep 17 00:00:00 2001
From: Jerry Xu
Date: Sun, 26 May 2024 16:12:42 -0400
Subject: [PATCH] velocity loss added

---
 .../legged_gym/envs/a1/a1_leap_config.py  | 11 ++++---
 .../envs/base/legged_robot_field.py       |  4 ++-
 legged_gym/legged_gym/scripts/play_vel.py | 12 +++----
 rsl_rl/rsl_rl/algorithms/ppo.py           | 33 ++++++++++++-------
 rsl_rl/rsl_rl/runners/on_policy_runner.py |  4 +++
 5 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/legged_gym/legged_gym/envs/a1/a1_leap_config.py b/legged_gym/legged_gym/envs/a1/a1_leap_config.py
index 1c9ffd3..7963d51 100644
--- a/legged_gym/legged_gym/envs/a1/a1_leap_config.py
+++ b/legged_gym/legged_gym/envs/a1/a1_leap_config.py
@@ -11,7 +11,7 @@ class A1LeapCfg( A1FieldCfg ):
     #     latency_range = [0.04-0.0025, 0.04+0.0075]
     #### uncomment the above to train non-virtual terrain
     class env(A1FieldCfg.env):
-        num_envs = 4
+        num_envs = 4096
     class terrain( A1FieldCfg.terrain ):
         max_init_terrain_level = 2
         border_size = 5
@@ -37,7 +37,7 @@ class A1LeapCfg( A1FieldCfg ):

     class commands( A1FieldCfg.commands ):
         class ranges( A1FieldCfg.commands.ranges ):
-            lin_vel_x = [1.0, 4.0]
+            lin_vel_x = [1.5, 3.0]
             lin_vel_y = [0.0, 0.0]
             ang_vel_yaw = [0., 0.]

@@ -63,11 +63,12 @@ class A1LeapCfg( A1FieldCfg ):
             tracking_ang_vel = 0.05
             world_vel_l2norm = -1.
             legs_energy_substeps = -1e-6
-            alive = 2.
+            alive = 1. # 2.
             penetrate_depth = -4e-3
             penetrate_volume = -4e-3
             exceed_dof_pos_limits = -1e-1
             exceed_torque_limits_i = -2e-1
+            lin_pos_x = 1.
             # track_predict_vel_l2norm = -1.

         soft_dof_pos_limit = 0.9
@@ -82,7 +83,7 @@ class A1LeapCfgPPO( A1FieldCfgPPO ):
     class algorithm( A1FieldCfgPPO.algorithm ):
         entropy_coef = 0.0
         clip_min_std = 0.2
-        lin_vel_x = [2.0, 3.0]
+        lin_vel_x = [0.5, 3.0]
         command_scale = 2.0

     class runner( A1FieldCfgPPO.runner ):
@@ -101,7 +102,7 @@ class A1LeapCfgPPO( A1FieldCfgPPO ):
         resume = True
         # load_run = "{Your traind walking model directory}"
         # load_run = "May16_18-12-08_WalkingBase_pEnergySubsteps2e-5_aScale0.5"
-        load_run = "High_speed_walk"
+        load_run = "Leap_2m_2500"
         # load_run = "May15_21-34-27_Skillleap_pEnergySubsteps-1e-06_virtual"#"May15_17-07-38_WalkingBase_pEnergySubsteps2e-5_aScale0.5"
         # load_run = "{Your virtually trained leap model directory}"
         max_iterations = 20000
diff --git a/legged_gym/legged_gym/envs/base/legged_robot_field.py b/legged_gym/legged_gym/envs/base/legged_robot_field.py
index 1004ade..be46710 100644
--- a/legged_gym/legged_gym/envs/base/legged_robot_field.py
+++ b/legged_gym/legged_gym/envs/base/legged_robot_field.py
@@ -1037,7 +1037,9 @@ class LeggedRobotField(LeggedRobot):
         world_vel_error = torch.sum(torch.square(self.commands[:, :2] - self.root_states[:, 7:9]), dim= 1)
         return (1 - torch.exp(-world_vel_error/self.cfg.rewards.tracking_sigma)) * engaging_mask # reverse version of tracking reward

-
+    def _reward_lin_pos_x(self):
+        return torch.abs((self.root_states[:, :3] - self.env_origins)[:, 0])
+
     ##### Some helper functions that override parent class attributes #####
     @property
     def all_obs_components(self):
diff --git a/legged_gym/legged_gym/scripts/play_vel.py b/legged_gym/legged_gym/scripts/play_vel.py
index b330b65..e2f16b0 100644
--- a/legged_gym/legged_gym/scripts/play_vel.py
+++ b/legged_gym/legged_gym/scripts/play_vel.py
@@ -106,14 +106,14 @@ def play(args):
         # "tilt",
     ]
     env_cfg.terrain.BarrierTrack_kwargs["leap"] = dict(
-        length= (1.3, 1.3),
+        length= (1.5, 1.5),
         depth= (0.4, 0.8),
         height= 0.2,
     )
     if "one_obstacle_per_track" in env_cfg.terrain.BarrierTrack_kwargs.keys():
         env_cfg.terrain.BarrierTrack_kwargs.pop("one_obstacle_per_track")
-    env_cfg.terrain.BarrierTrack_kwargs["n_obstacles_per_track"] = 2# 2
+    env_cfg.terrain.BarrierTrack_kwargs["n_obstacles_per_track"] = 1# 2
     env_cfg.commands.ranges.lin_vel_x = [3.0, 3.0] # [1.2, 1.2]
     env_cfg.terrain.BarrierTrack_kwargs['track_block_length']= 3.

     if "distill" in args.task:
@@ -239,11 +239,11 @@ def play(args):
            if "obs_slice" in locals().keys():
                obs_component = obs[:, obs_slice[0]].reshape(-1, *obs_slice[1])
                print(obs_component[robot_index])
-            vel_obs = torch.cat([obs[:, :9], obs[:, 12:]], dim=1)
+            vel_obs = torch.cat([obs[..., :9], obs[..., 12:]], dim=-1)
             velocity = velocity_planner(vel_obs)
-            print(velocity)
-            print(env_cfg.commands.ranges.lin_vel_x)
-            velocity = torch.clip(velocity, env_cfg.commands.ranges.lin_vel_x[0], env_cfg.commands.ranges.lin_vel_x[1])
+            env.commands[..., 0] = velocity.squeeze(-1)
+            obs[..., 9] = velocity.squeeze(-1) * env.obs_scales.lin_vel
+            # velocity = torch.clip(velocity, env_cfg.commands.ranges.lin_vel_x[0], env_cfg.commands.ranges.lin_vel_x[1])
             actions = policy(obs.detach())
             teacher_actions = actions
             obs, critic_obs, rews, dones, infos = env.step(actions.detach(), velocity)
diff --git a/rsl_rl/rsl_rl/algorithms/ppo.py b/rsl_rl/rsl_rl/algorithms/ppo.py
index 95c3ea1..db11708 100644
--- a/rsl_rl/rsl_rl/algorithms/ppo.py
+++ b/rsl_rl/rsl_rl/algorithms/ppo.py
@@ -32,6 +32,7 @@ from collections import defaultdict
 import torch
 import torch.nn as nn
 import torch.optim as optim
+import numpy as np
 import copy

 from rsl_rl.modules import ActorCritic
@@ -99,9 +100,11 @@ class PPO:

     def test_mode(self):
         self.actor_critic.test()
+        self.velocity_planner.eval()

     def train_mode(self):
         self.actor_critic.train()
+        self.velocity_planner.train()

     def act(self, obs, critic_obs):
         if self.actor_critic.is_recurrent:
@@ -109,10 +112,10 @@ class PPO:
         # Compute the actions and values
         vel_obs = torch.cat([obs[..., :9], obs[..., 12:]], dim=-1)
         velocity = self.velocity_planner(vel_obs)
-        if self.lin_vel_x is not None:
-            velocity = torch.clip(velocity, self.lin_vel_x[0], self.lin_vel_x[1])
-        velocity *= self.command_scale
-        self.transition.actions = self.actor_critic.act(obs, velocity=velocity)[0].detach()
+        # if self.lin_vel_x is not None:
+        #     velocity = torch.clip(velocity, self.lin_vel_x[0], self.lin_vel_x[1])
+        self.transition.actions = self.actor_critic.act(obs, velocity=velocity * self.command_scale)[0].detach()
+        critic_obs[..., 9] = velocity.squeeze(-1) * self.command_scale
         self.transition.values = self.actor_critic.evaluate(critic_obs).detach()
         self.transition.actions_log_prob = self.actor_critic.get_actions_log_prob(self.transition.actions).detach()
         self.transition.action_mean = self.actor_critic.action_mean.detach()
@@ -148,7 +151,7 @@ class PPO:

         generator = self.storage.mini_batch_generator(self.num_mini_batches, self.num_learning_epochs)
         for minibatch in generator:
-            losses, _, stats = self.compute_losses(minibatch)
+            losses, _, stats = self.compute_losses(minibatch, current_learning_iteration=current_learning_iteration)

             loss = 0.
             for k, v in losses.items():
@@ -177,15 +180,15 @@ class PPO:

         return mean_losses, average_stats

-    def compute_losses(self, minibatch):
+    def compute_losses(self, minibatch, current_learning_iteration=None):
         obs = copy.deepcopy(minibatch.obs)
-        # print(obs.shape)
+
         vel_obs = torch.cat([obs[..., :9], obs[..., 12:]], dim=-1)
-        # print(vel_obs.shape)
+
         velocity = self.velocity_planner(vel_obs)
-        if self.lin_vel_x is not None:
-            velocity = torch.clip(velocity, self.lin_vel_x[0], self.lin_vel_x[1])
-        self.actor_critic.act(obs, masks=minibatch.masks, hidden_states=minibatch.hid_states[0], velocity=velocity)
+        # if self.lin_vel_x is not None:
+        #     velocity = torch.clip(velocity, self.lin_vel_x[0], self.lin_vel_x[1])
+        self.actor_critic.act(obs, masks=minibatch.masks, hidden_states=minibatch.hid_states[0], velocity=velocity * self.command_scale)
         actions_log_prob_batch = self.actor_critic.get_actions_log_prob(minibatch.actions)
         value_batch = self.actor_critic.evaluate(obs, masks=minibatch.masks, hidden_states=minibatch.hid_states[1])
         mu_batch = self.actor_critic.action_mean
@@ -228,9 +231,17 @@ class PPO:
         else:
             value_loss = (minibatch.returns - value_batch).pow(2).mean()

+        # Velocity loss
+        if current_learning_iteration is None:
+            vel_loss = 0
+        else:
+            vel_loss = torch.square(velocity-2).mean() * np.exp(-0.01 * current_learning_iteration + 125)
+            vel_loss += torch.square(torch.clamp_max(velocity, 1.) - 1).mean()
+
         return_ = dict(
             surrogate_loss= surrogate_loss,
             value_loss= value_loss,
+            vel_loss = vel_loss
         )
         if entropy_batch is not None:
             return_["entropy"] = - entropy_batch.mean()
diff --git a/rsl_rl/rsl_rl/runners/on_policy_runner.py b/rsl_rl/rsl_rl/runners/on_policy_runner.py
index 94a400c..c609309 100644
--- a/rsl_rl/rsl_rl/runners/on_policy_runner.py
+++ b/rsl_rl/rsl_rl/runners/on_policy_runner.py
@@ -221,6 +221,7 @@ class OnPolicyRunner:
                         'collection_time']:.3f}s, learning {locs['learn_time']:.3f}s)\n"""
                       f"""{'Value function loss:':>{pad}} {locs["losses"]['value_loss']:.4f}\n"""
                       f"""{'Surrogate loss:':>{pad}} {locs["losses"]['surrogate_loss']:.4f}\n"""
+                      f"""{'Velocity loss:':>{pad}} {locs["losses"]['vel_loss']:.4f}\n"""
                       f"""{'Mean action noise std:':>{pad}} {mean_std.item():.2f}\n"""
                       # f"""{'Mean reward/step:':>{pad}} {locs['mean_reward']:.2f}\n"""
                       # f"""{'Mean episode length/episode:':>{pad}} {locs['mean_trajectory_length']:.2f}\n"""
@@ -238,6 +239,7 @@ class OnPolicyRunner:
     def save(self, path, infos=None):
         run_state_dict = {
             'model_state_dict': self.alg.actor_critic.state_dict(),
+            'velocity_planner_state_dict': self.alg.velocity_planner.state_dict(),
             'optimizer_state_dict': self.alg.optimizer.state_dict(),
             'velocity_optimizer_state_dict': self.alg.velocity_optimizer.state_dict(),
             'iter': self.current_learning_iteration,
@@ -250,6 +252,8 @@ class OnPolicyRunner:
     def load(self, path, load_optimizer=True):
         loaded_dict = torch.load(path)
         self.alg.actor_critic.load_state_dict(loaded_dict['model_state_dict'])
+        if 'velocity_planner_state_dict' in loaded_dict:
+            self.alg.velocity_planner.load_state_dict(loaded_dict['velocity_planner_state_dict'])
         if load_optimizer and "optimizer_state_dict" in loaded_dict:
             self.alg.optimizer.load_state_dict(loaded_dict['optimizer_state_dict'], )
         if load_optimizer and "velocity_optimizer_state_dict" in loaded_dict:
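
Note on the velocity loss schedule (illustrative, not part of the patch): the block added to PPO.compute_losses pulls the planner's commanded velocity toward 2 m/s with a weight np.exp(-0.01 * it + 125) = exp(-0.01 * (it - 12500)), which decays by a factor of e every 100 iterations and equals 1 at iteration 12500, while the clamp_max term only penalizes commands below 1 m/s. The snippet below is a minimal standalone sketch of that formula; the velocity_loss helper name, tensor shape, and example values are assumptions for illustration and do not appear in the patch.

import torch
import numpy as np

def velocity_loss(velocity: torch.Tensor, current_learning_iteration: int) -> torch.Tensor:
    # Same formula as the block added to PPO.compute_losses above.
    # Term 1: pull commanded velocity toward 2 m/s, weighted by
    #         exp(-0.01 * it + 125), which shrinks by a factor of e every 100 iterations.
    anneal = np.exp(-0.01 * current_learning_iteration + 125)
    loss = torch.square(velocity - 2).mean() * anneal
    # Term 2: clamp_max zeroes the penalty for commands >= 1 m/s, so only
    #         commands below 1 m/s are pushed up, preventing collapse to slow commands.
    loss = loss + torch.square(torch.clamp_max(velocity, 1.) - 1).mean()
    return loss

# Illustrative usage: hypothetical planner outputs for 4096 envs.
velocity = torch.rand(4096, 1) * 3.0
print(velocity_loss(velocity, current_learning_iteration=13000))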