Compare commits
4 commits (f899edb57f ... 2ecc34ceb9):
2ecc34ceb9
8598e80718
6fa3e5f9ad
b7bd13570f

@@ -1,6 +1,6 @@
 # @package _global_

-fps: 20
+fps: 400

 env:
   name: maniskill/pushcube

@@ -8,22 +8,23 @@
 # env.gym.obs_type=environment_state_agent_pos \

 seed: 1
-dataset_repo_id: "AdilZtn/Maniskill-Pushcube-demonstration-medium"
+# dataset_repo_id: "AdilZtn/Maniskill-Pushcube-demonstration-medium"
+dataset_repo_id: null

 training:
   # Offline training dataloader
   num_workers: 4

   batch_size: 512
-  grad_clip_norm: 10.0
+  grad_clip_norm: 40.0
   lr: 3e-4


-  storage_device: "cpu"
+  storage_device: "cuda"

   eval_freq: 2500
   log_freq: 10
-  save_freq: 2000000
+  save_freq: 1000000

   online_steps: 1000000
   online_rollout_n_episodes: 10

@@ -32,17 +33,12 @@ training:
   online_sampling_ratio: 1.0
   online_env_seed: 10000
   online_buffer_capacity: 200000
+  offline_buffer_capacity: 100000
   online_buffer_seed_size: 0
   online_step_before_learning: 500
   do_online_rollout_async: false
   policy_update_freq: 1

-  # delta_timestamps:
-  #   observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-  #   observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]"
-  #   action: "[i / ${fps} for i in range(${policy.horizon})]"
-  #   next.reward: "[i / ${fps} for i in range(${policy.horizon})]"

 policy:
   name: sac

@@ -68,28 +64,33 @@ policy:
   camera_number: 1

   # Normalization / Unnormalization
-  input_normalization_modes: null
-  # input_normalization_modes:
-  #   observation.state: min_max
-  input_normalization_params: null
-  # observation.state:
-  #   min: [-1.9361e+00, -7.7640e-01, -7.7094e-01, -2.9709e+00, -8.5656e-01,
-  #         1.0764e+00, -1.2680e+00, 0.0000e+00, 0.0000e+00, -9.3448e+00,
-  #         -3.3828e+00, -3.8420e+00, -5.2553e+00, -3.4154e+00, -6.5082e+00,
-  #         -6.0500e+00, -8.7193e+00, -8.2337e+00, -3.4650e-01, -4.9441e-01,
-  #         8.3516e-03, -3.1114e-01, -9.9700e-01, -2.3471e-01, -2.7137e-01]
-
-  #   max: [ 0.8644, 1.4306, 1.8520, -0.7578, 0.9508, 3.4901, 1.9381, 0.0400,
-  #          0.0400, 5.0885, 4.7156, 7.9393, 7.9100, 2.9796, 5.7720, 4.7163,
-  #          7.8145, 9.7415, 0.2422, 0.4505, 0.6306, 0.2622, 1.0000, 0.5135,
-  #          0.4001]
+  # input_normalization_modes: null
+  input_normalization_modes:
+    observation.state: min_max
+    observation.image: mean_std
+  # input_normalization_params: null
+  input_normalization_params:
+    observation.state:
+      min: [-1.9361e+00, -7.7640e-01, -7.7094e-01, -2.9709e+00, -8.5656e-01,
+            1.0764e+00, -1.2680e+00, 0.0000e+00, 0.0000e+00, -9.3448e+00,
+            -3.3828e+00, -3.8420e+00, -5.2553e+00, -3.4154e+00, -6.5082e+00,
+            -6.0500e+00, -8.7193e+00, -8.2337e+00, -3.4650e-01, -4.9441e-01,
+            8.3516e-03, -3.1114e-01, -9.9700e-01, -2.3471e-01, -2.7137e-01]
+      max: [ 0.8644, 1.4306, 1.8520, -0.7578, 0.9508, 3.4901, 1.9381, 0.0400,
+             0.0400, 5.0885, 4.7156, 7.9393, 7.9100, 2.9796, 5.7720, 4.7163,
+             7.8145, 9.7415, 0.2422, 0.4505, 0.6306, 0.2622, 1.0000, 0.5135,
+             0.4001]
+
+    observation.image:
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]

   output_normalization_modes:
     action: min_max
   output_normalization_params:
     action:
-      min: [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
-      max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+      min: [-0.03, -0.03, -0.03, -0.03, -0.03, -0.03, -0.03]
+      max: [0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03]
   output_normalization_shapes:
     action: [7]

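The hunk above turns input normalization on: min_max for the 25-dimensional state (with explicit bounds) and mean_std for the image, using the standard ImageNet statistics. As a rough illustration only, not LeRobot's normalization module, the two modes conventionally amount to the following (min_max commonly maps to [-1, 1]; the exact target range used by the repo is not shown in this diff, and the state bounds below are placeholders):

import torch

def min_max_normalize(x: torch.Tensor, min_v: torch.Tensor, max_v: torch.Tensor) -> torch.Tensor:
    # Rescale each dimension from [min, max] to [-1, 1].
    return 2.0 * (x - min_v) / (max_v - min_v) - 1.0

def mean_std_normalize(img: torch.Tensor, mean: torch.Tensor, std: torch.Tensor) -> torch.Tensor:
    # Standardize a (C, H, W) image per channel, e.g. with the ImageNet stats above.
    return (img - mean[:, None, None]) / std[:, None, None]

state = torch.zeros(25)                 # 25-dim state, matching the min/max lists above
state_min = torch.full((25,), -1.0)     # placeholder bounds for this sketch
state_max = torch.full((25,), 1.0)
image = torch.rand(3, 128, 128)         # assumed image size, not taken from the config

print(min_max_normalize(state, state_min, state_max).shape)
print(mean_std_normalize(image, torch.tensor([0.485, 0.456, 0.406]),
                         torch.tensor([0.229, 0.224, 0.225])).shape)
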
@@ -99,8 +100,8 @@ policy:
   # discount: 0.99
   discount: 0.80
   temperature_init: 1.0
-  num_critics: 10 #10
-  num_subsample_critics: 2
+  num_critics: 2 #10
+  num_subsample_critics: null
   critic_lr: 3e-4
   actor_lr: 3e-4
   temperature_lr: 3e-4

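num_critics drops from 10 to 2 and num_subsample_critics becomes null, i.e. the REDQ-style trick of taking the minimum over a random subset of a larger critic ensemble is switched off in favour of plain two-critic SAC. A rough sketch of what the subsampling option means (an assumption about the mechanism, not the repo's critic code):

import random
from typing import Optional
import torch

def target_q(q_values: torch.Tensor, num_subsample_critics: Optional[int]) -> torch.Tensor:
    # q_values: (num_critics, batch) predictions from the target critics.
    if num_subsample_critics is not None:
        idx = random.sample(range(q_values.shape[0]), num_subsample_critics)
        q_values = q_values[idx]
    # SAC-style pessimistic target: minimum over the (possibly subsampled) ensemble.
    return q_values.min(dim=0).values

qs = torch.randn(2, 512)          # num_critics: 2, batch_size: 512
print(target_q(qs, None).shape)   # num_subsample_critics: null -> min over both critics
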
@@ -111,7 +112,7 @@ policy:
 actor_learner_config:
   learner_host: "127.0.0.1"
   learner_port: 50051
-  policy_parameters_push_frequency: 1
+  policy_parameters_push_frequency: 4
 concurrency:
-  actor: 'processes'
-  learner: 'processes'
+  actor: 'threads'
+  learner: 'threads'

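policy_parameters_push_frequency goes from 1 to 4. Judging from the learner hunk further down, the value is compared against elapsed wall-clock time, so it acts as a minimum number of seconds between parameter pushes rather than a rate. A minimal sketch of that check:

import time

policy_parameters_push_frequency = 4  # seconds, per the new config value
last_time_policy_pushed = 0.0

for _ in range(3):
    if time.time() - last_time_policy_pushed > policy_parameters_push_frequency:
        print("push actor parameters to the queue")
        last_time_policy_pushed = time.time()
    time.sleep(0.1)
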
@@ -202,6 +202,7 @@ def initialize_offline_replay_buffer(
         action_delta=cfg.env.wrapper.delta_action,
         storage_device=storage_device,
         optimize_memory=True,
+        capacity=cfg.training.offline_buffer_capacity,
     )
     return offline_replay_buffer

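The new capacity argument wires the offline buffer to the offline_buffer_capacity key added to the config (100000 above). A sketch of what a capacity bound on a replay buffer typically implies, assuming FIFO eviction (this is not LeRobot's ReplayBuffer implementation):

from collections import deque

class BoundedBuffer:
    def __init__(self, capacity: int):
        # Oldest transitions are dropped automatically once the buffer is full.
        self.storage = deque(maxlen=capacity)

    def add(self, transition) -> None:
        self.storage.append(transition)

    def __len__(self) -> int:
        return len(self.storage)

buf = BoundedBuffer(capacity=100_000)  # matches offline_buffer_capacity above
buf.add({"obs": None, "action": None, "reward": 0.0})
print(len(buf))
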
@@ -508,6 +509,22 @@ def add_actor_information_and_train(
         resume_interaction_step if resume_interaction_step is not None else 0
     )

+    # Extract variables from cfg
+    online_step_before_learning = cfg.training.online_step_before_learning
+    utd_ratio = cfg.policy.utd_ratio
+    dataset_repo_id = cfg.dataset_repo_id
+    fps = cfg.fps
+    log_freq = cfg.training.log_freq
+    save_freq = cfg.training.save_freq
+    device = cfg.device
+    storage_device = cfg.training.storage_device
+    policy_update_freq = cfg.training.policy_update_freq
+    policy_parameters_push_frequency = (
+        cfg.actor_learner_config.policy_parameters_push_frequency
+    )
+    save_checkpoint = cfg.training.save_checkpoint
+    online_steps = cfg.training.online_steps
+
     while True:
         if shutdown_event is not None and shutdown_event.is_set():
             logging.info("[LEARNER] Shutdown signal received. Exiting...")

@@ -545,15 +562,15 @@ def add_actor_information_and_train(

         logging.debug("[LEARNER] Received interactions")

-        if len(replay_buffer) < cfg.training.online_step_before_learning:
+        if len(replay_buffer) < online_step_before_learning:
             continue

         logging.debug("[LEARNER] Starting optimization loop")
         time_for_one_optimization_step = time.time()
-        for _ in range(cfg.policy.utd_ratio - 1):
+        for _ in range(utd_ratio - 1):
             batch = replay_buffer.sample(batch_size)

-            if cfg.dataset_repo_id is not None:
+            if dataset_repo_id is not None:
                 batch_offline = offline_replay_buffer.sample(batch_size)
                 batch = concatenate_batch_transitions(batch, batch_offline)

@@ -590,7 +607,7 @@ def add_actor_information_and_train(

         batch = replay_buffer.sample(batch_size)

-        if cfg.dataset_repo_id is not None:
+        if dataset_repo_id is not None:
             batch_offline = offline_replay_buffer.sample(batch_size)
             batch = concatenate_batch_transitions(
                 left_batch_transitions=batch, right_batch_transition=batch_offline

@@ -632,8 +649,8 @@ def add_actor_information_and_train(
         training_infos["loss_critic"] = loss_critic.item()
         training_infos["critic_grad_norm"] = critic_grad_norm

-        if optimization_step % cfg.training.policy_update_freq == 0:
-            for _ in range(cfg.training.policy_update_freq):
+        if optimization_step % policy_update_freq == 0:
+            for _ in range(policy_update_freq):
                 loss_actor = policy.compute_loss_actor(
                     observations=observations,
                     observation_features=observation_features,

@@ -671,15 +688,18 @@ def add_actor_information_and_train(
                 training_infos["temperature_grad_norm"] = temp_grad_norm
                 training_infos["temperature"] = policy.temperature

-        if (
-            time.time() - last_time_policy_pushed
-            > cfg.actor_learner_config.policy_parameters_push_frequency
-        ):
+        if time.time() - last_time_policy_pushed > policy_parameters_push_frequency:
             push_actor_policy_to_queue(parameters_queue, policy)
             last_time_policy_pushed = time.time()

         policy.update_target_networks()
-        if optimization_step % cfg.training.log_freq == 0:
+
+        if optimization_step % log_freq == 0:
+            training_infos["replay_buffer_size"] = len(replay_buffer)
+            if offline_replay_buffer is not None:
+                training_infos["offline_replay_buffer_size"] = len(
+                    offline_replay_buffer
+                )
             training_infos["Optimization step"] = optimization_step
             logger.log_dict(
                 d=training_infos, mode="train", custom_step_key="Optimization step"

@@ -705,17 +725,14 @@ def add_actor_information_and_train(
             )

         optimization_step += 1
-        if optimization_step % cfg.training.log_freq == 0:
+        if optimization_step % log_freq == 0:
             logging.info(f"[LEARNER] Number of optimization step: {optimization_step}")

-        if cfg.training.save_checkpoint and (
-            optimization_step % cfg.training.save_freq == 0
-            or optimization_step == cfg.training.online_steps
+        if save_checkpoint and (
+            optimization_step % save_freq == 0 or optimization_step == online_steps
         ):
             logging.info(f"Checkpoint policy after step {optimization_step}")
-            # Note: Save with step as the identifier, and format it to have at least 6 digits but more if
-            # needed (choose 6 as a minimum for consistency without being overkill).
-            _num_digits = max(6, len(str(cfg.training.online_steps)))
+            _num_digits = max(6, len(str(online_steps)))
             step_identifier = f"{optimization_step:0{_num_digits}d}"
             interaction_step = (
                 interaction_message["Interaction step"]

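The checkpoint naming logic survives the refactor unchanged apart from using the hoisted online_steps. A worked example with the config's online_steps of 1,000,000:

online_steps = 1_000_000
_num_digits = max(6, len(str(online_steps)))   # len("1000000") == 7 -> 7
step_identifier = f"{2500:0{_num_digits}d}"
print(step_identifier)                         # "0002500"
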
@@ -739,7 +756,7 @@ def add_actor_information_and_train(
                     dataset_dir,
                 )
             replay_buffer.to_lerobot_dataset(
-                cfg.dataset_repo_id, fps=cfg.fps, root=logger.log_dir / "dataset"
+                dataset_repo_id, fps=fps, root=logger.log_dir / "dataset"
             )
             if offline_replay_buffer is not None:
                 dataset_dir = logger.log_dir / "dataset_offline"

|
@ -159,7 +159,7 @@ def make_maniskill(
|
||||||
env.unwrapped.metadata["render_fps"] = 20
|
env.unwrapped.metadata["render_fps"] = 20
|
||||||
env = ManiSkillCompat(env)
|
env = ManiSkillCompat(env)
|
||||||
env = ManiSkillActionWrapper(env)
|
env = ManiSkillActionWrapper(env)
|
||||||
env = ManiSkillMultiplyActionWrapper(env, multiply_factor=1)
|
env = ManiSkillMultiplyActionWrapper(env, multiply_factor=0.03)
|
||||||
|
|
||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
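multiply_factor drops from 1 to 0.03, which lines up with the new ±0.03 action bounds in the config above. The wrapper's implementation is not part of this diff; as an assumption about its general shape, a multiply-action wrapper usually just scales every action by a constant before it reaches the base environment:

import gymnasium as gym

class MultiplyActionWrapper(gym.ActionWrapper):
    """Sketch of an action-scaling wrapper; not the repo's ManiSkillMultiplyActionWrapper."""

    def __init__(self, env: gym.Env, multiply_factor: float = 0.03):
        super().__init__(env)
        self.multiply_factor = multiply_factor

    def action(self, action):
        # Scale the policy's action before handing it to the underlying env.
        return action * self.multiply_factor

# Hypothetical usage: env = MultiplyActionWrapper(gym.make("Pendulum-v1"), multiply_factor=0.03)
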