Compare commits

No commits in common. "2ecc34ceb90e2c65f16f56ed4fa57511ac632799" and "f899edb57fe3c4bc1caec9a4053aa90f7899e2bc" have entirely different histories.

4 changed files with 52 additions and 70 deletions

View File

@@ -1,6 +1,6 @@
 # @package _global_
-fps: 400
+fps: 20

 env:
   name: maniskill/pushcube

View File

@@ -8,23 +8,22 @@
 # env.gym.obs_type=environment_state_agent_pos \
 seed: 1
-# dataset_repo_id: "AdilZtn/Maniskill-Pushcube-demonstration-medium"
-dataset_repo_id: null
+dataset_repo_id: "AdilZtn/Maniskill-Pushcube-demonstration-medium"

 training:
   # Offline training dataloader
   num_workers: 4
   batch_size: 512
-  grad_clip_norm: 40.0
+  grad_clip_norm: 10.0
   lr: 3e-4
-  storage_device: "cuda"
+  storage_device: "cpu"
   eval_freq: 2500
   log_freq: 10
-  save_freq: 1000000
+  save_freq: 2000000
   online_steps: 1000000
   online_rollout_n_episodes: 10
@@ -33,12 +32,17 @@ training:
   online_sampling_ratio: 1.0
   online_env_seed: 10000
   online_buffer_capacity: 200000
-  offline_buffer_capacity: 100000
   online_buffer_seed_size: 0
   online_step_before_learning: 500
   do_online_rollout_async: false
   policy_update_freq: 1
+  # delta_timestamps:
+  #   observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+  #   observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
+  #   action: "[i / ${fps} for i in range(${policy.horizon})]"
+  #   next.reward: "[i / ${fps} for i in range(${policy.horizon})]"

 policy:
   name: sac
@@ -64,33 +68,28 @@ policy:
   camera_number: 1

   # Normalization / Unnormalization
-  # input_normalization_modes: null
-  input_normalization_modes:
-    observation.state: min_max
-    observation.image: mean_std
-  # input_normalization_params: null
-  input_normalization_params:
-    observation.state:
-      min: [-1.9361e+00, -7.7640e-01, -7.7094e-01, -2.9709e+00, -8.5656e-01,
-            1.0764e+00, -1.2680e+00, 0.0000e+00, 0.0000e+00, -9.3448e+00,
-            -3.3828e+00, -3.8420e+00, -5.2553e+00, -3.4154e+00, -6.5082e+00,
-            -6.0500e+00, -8.7193e+00, -8.2337e+00, -3.4650e-01, -4.9441e-01,
-            8.3516e-03, -3.1114e-01, -9.9700e-01, -2.3471e-01, -2.7137e-01]
-      max: [ 0.8644, 1.4306, 1.8520, -0.7578, 0.9508, 3.4901, 1.9381, 0.0400,
-             0.0400, 5.0885, 4.7156, 7.9393, 7.9100, 2.9796, 5.7720, 4.7163,
-             7.8145, 9.7415, 0.2422, 0.4505, 0.6306, 0.2622, 1.0000, 0.5135,
-             0.4001]
-    observation.image:
-      mean: [0.485, 0.456, 0.406]
-      std: [0.229, 0.224, 0.225]
+  input_normalization_modes: null
+  # input_normalization_modes:
+  #   observation.state: min_max
+  input_normalization_params: null
+  #   observation.state:
+  #     min: [-1.9361e+00, -7.7640e-01, -7.7094e-01, -2.9709e+00, -8.5656e-01,
+  #           1.0764e+00, -1.2680e+00, 0.0000e+00, 0.0000e+00, -9.3448e+00,
+  #           -3.3828e+00, -3.8420e+00, -5.2553e+00, -3.4154e+00, -6.5082e+00,
+  #           -6.0500e+00, -8.7193e+00, -8.2337e+00, -3.4650e-01, -4.9441e-01,
+  #           8.3516e-03, -3.1114e-01, -9.9700e-01, -2.3471e-01, -2.7137e-01]
+  #     max: [ 0.8644, 1.4306, 1.8520, -0.7578, 0.9508, 3.4901, 1.9381, 0.0400,
+  #            0.0400, 5.0885, 4.7156, 7.9393, 7.9100, 2.9796, 5.7720, 4.7163,
+  #            7.8145, 9.7415, 0.2422, 0.4505, 0.6306, 0.2622, 1.0000, 0.5135,
+  #            0.4001]

   output_normalization_modes:
     action: min_max
   output_normalization_params:
     action:
-      min: [-0.03, -0.03, -0.03, -0.03, -0.03, -0.03, -0.03]
-      max: [0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03]
+      min: [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
+      max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
   output_normalization_shapes:
     action: [7]
@@ -100,8 +99,8 @@ policy:
   # discount: 0.99
   discount: 0.80
   temperature_init: 1.0
-  num_critics: 2 #10
-  num_subsample_critics: null
+  num_critics: 10 #10
+  num_subsample_critics: 2
   critic_lr: 3e-4
   actor_lr: 3e-4
   temperature_lr: 3e-4
@@ -112,7 +111,7 @@ policy:
   actor_learner_config:
     learner_host: "127.0.0.1"
     learner_port: 50051
-    policy_parameters_push_frequency: 4
+    policy_parameters_push_frequency: 1
   concurrency:
-    actor: 'threads'
-    learner: 'threads'
+    actor: 'processes'
+    learner: 'processes'
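
Note on the critic settings in the hunk above: num_critics: 10 combined with num_subsample_critics: 2 reads like an ensemble of critics whose Bellman target is built from a random subset (REDQ-style). The sketch below only illustrates that reading under assumed tensor shapes; it is not the repository's implementation:

import torch

def subsampled_min_q(q_targets: torch.Tensor, num_subsample_critics: int | None) -> torch.Tensor:
    # q_targets: (num_critics, batch) Q-values from the target critic ensemble.
    if num_subsample_critics is not None:
        # Randomly keep a subset of critics, e.g. 2 out of 10 with this config.
        idx = torch.randperm(q_targets.shape[0])[:num_subsample_critics]
        q_targets = q_targets[idx]
    # Pessimistic aggregation: element-wise minimum over the (sub)ensemble.
    return q_targets.min(dim=0).values

With num_subsample_critics: null (the old value) the minimum would simply run over every critic in the ensemble.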

View File

@@ -202,7 +202,6 @@ def initialize_offline_replay_buffer(
         action_delta=cfg.env.wrapper.delta_action,
         storage_device=storage_device,
         optimize_memory=True,
-        capacity=cfg.training.offline_buffer_capacity,
     )
     return offline_replay_buffer
@@ -509,22 +508,6 @@ def add_actor_information_and_train(
         resume_interaction_step if resume_interaction_step is not None else 0
     )

-    # Extract variables from cfg
-    online_step_before_learning = cfg.training.online_step_before_learning
-    utd_ratio = cfg.policy.utd_ratio
-    dataset_repo_id = cfg.dataset_repo_id
-    fps = cfg.fps
-    log_freq = cfg.training.log_freq
-    save_freq = cfg.training.save_freq
-    device = cfg.device
-    storage_device = cfg.training.storage_device
-    policy_update_freq = cfg.training.policy_update_freq
-    policy_parameters_push_frequency = (
-        cfg.actor_learner_config.policy_parameters_push_frequency
-    )
-    save_checkpoint = cfg.training.save_checkpoint
-    online_steps = cfg.training.online_steps
-
     while True:
         if shutdown_event is not None and shutdown_event.is_set():
             logging.info("[LEARNER] Shutdown signal received. Exiting...")
@@ -562,15 +545,15 @@ def add_actor_information_and_train(
             logging.debug("[LEARNER] Received interactions")

-        if len(replay_buffer) < online_step_before_learning:
+        if len(replay_buffer) < cfg.training.online_step_before_learning:
             continue

         logging.debug("[LEARNER] Starting optimization loop")
         time_for_one_optimization_step = time.time()

-        for _ in range(utd_ratio - 1):
+        for _ in range(cfg.policy.utd_ratio - 1):
             batch = replay_buffer.sample(batch_size)

-            if dataset_repo_id is not None:
+            if cfg.dataset_repo_id is not None:
                 batch_offline = offline_replay_buffer.sample(batch_size)
                 batch = concatenate_batch_transitions(batch, batch_offline)
@@ -607,7 +590,7 @@ def add_actor_information_and_train(
         batch = replay_buffer.sample(batch_size)

-        if dataset_repo_id is not None:
+        if cfg.dataset_repo_id is not None:
             batch_offline = offline_replay_buffer.sample(batch_size)
             batch = concatenate_batch_transitions(
                 left_batch_transitions=batch, right_batch_transition=batch_offline
@@ -649,8 +632,8 @@ def add_actor_information_and_train(
         training_infos["loss_critic"] = loss_critic.item()
         training_infos["critic_grad_norm"] = critic_grad_norm

-        if optimization_step % policy_update_freq == 0:
-            for _ in range(policy_update_freq):
+        if optimization_step % cfg.training.policy_update_freq == 0:
+            for _ in range(cfg.training.policy_update_freq):
                 loss_actor = policy.compute_loss_actor(
                     observations=observations,
                     observation_features=observation_features,
@@ -688,18 +671,15 @@ def add_actor_information_and_train(
                 training_infos["temperature_grad_norm"] = temp_grad_norm
                 training_infos["temperature"] = policy.temperature

-        if time.time() - last_time_policy_pushed > policy_parameters_push_frequency:
+        if (
+            time.time() - last_time_policy_pushed
+            > cfg.actor_learner_config.policy_parameters_push_frequency
+        ):
             push_actor_policy_to_queue(parameters_queue, policy)
             last_time_policy_pushed = time.time()

         policy.update_target_networks()
-
-        if optimization_step % log_freq == 0:
-            training_infos["replay_buffer_size"] = len(replay_buffer)
-            if offline_replay_buffer is not None:
-                training_infos["offline_replay_buffer_size"] = len(
-                    offline_replay_buffer
-                )
+        if optimization_step % cfg.training.log_freq == 0:
             training_infos["Optimization step"] = optimization_step
             logger.log_dict(
                 d=training_infos, mode="train", custom_step_key="Optimization step"
@@ -725,14 +705,17 @@ def add_actor_information_and_train(
             )

         optimization_step += 1

-        if optimization_step % log_freq == 0:
+        if optimization_step % cfg.training.log_freq == 0:
             logging.info(f"[LEARNER] Number of optimization step: {optimization_step}")

-        if save_checkpoint and (
-            optimization_step % save_freq == 0 or optimization_step == online_steps
+        if cfg.training.save_checkpoint and (
+            optimization_step % cfg.training.save_freq == 0
+            or optimization_step == cfg.training.online_steps
         ):
             logging.info(f"Checkpoint policy after step {optimization_step}")
-            _num_digits = max(6, len(str(online_steps)))
+            # Note: Save with step as the identifier, and format it to have at least 6 digits but more if
+            # needed (choose 6 as a minimum for consistency without being overkill).
+            _num_digits = max(6, len(str(cfg.training.online_steps)))
             step_identifier = f"{optimization_step:0{_num_digits}d}"
             interaction_step = (
                 interaction_message["Interaction step"]
@@ -756,7 +739,7 @@ def add_actor_information_and_train(
                 dataset_dir,
             )
         replay_buffer.to_lerobot_dataset(
-            dataset_repo_id, fps=fps, root=logger.log_dir / "dataset"
+            cfg.dataset_repo_id, fps=cfg.fps, root=logger.log_dir / "dataset"
         )
         if offline_replay_buffer is not None:
             dataset_dir = logger.log_dir / "dataset_offline"
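
For reference, the checkpoint naming kept in the @@ -725,14 +705,17 @@ hunk above pads the optimization step to at least six digits, widening if cfg.training.online_steps needs more. A standalone illustration using the online_steps value from this config and a hypothetical step count:

online_steps = 1_000_000      # cfg.training.online_steps in this config
optimization_step = 12_500    # hypothetical step, purely for illustration
_num_digits = max(6, len(str(online_steps)))              # len("1000000") == 7
step_identifier = f"{optimization_step:0{_num_digits}d}"
print(step_identifier)                                    # -> "0012500"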

View File

@@ -159,7 +159,7 @@ def make_maniskill(
     env.unwrapped.metadata["render_fps"] = 20
     env = ManiSkillCompat(env)
     env = ManiSkillActionWrapper(env)
-    env = ManiSkillMultiplyActionWrapper(env, multiply_factor=0.03)
+    env = ManiSkillMultiplyActionWrapper(env, multiply_factor=1)

     return env
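
The multiply_factor change above is consistent with the new [-1.0, 1.0] action normalization in the training config: with a factor of 1 the wrapper no longer rescales the policy's actions. A minimal sketch of what a multiply-action wrapper of this kind typically looks like, given as an assumed shape for illustration rather than the repository's definition of ManiSkillMultiplyActionWrapper:

import gymnasium as gym
import numpy as np

class MultiplyActionWrapper(gym.ActionWrapper):
    """Scale every incoming action by a constant factor before the env sees it."""

    def __init__(self, env: gym.Env, multiply_factor: float = 1.0):
        super().__init__(env)
        self.multiply_factor = multiply_factor

    def action(self, action: np.ndarray) -> np.ndarray:
        # With multiply_factor=1 (this commit) the wrapper is effectively a no-op.
        return action * self.multiply_factor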