diff --git a/lerobot/common/policies/act/configuration_act.py b/lerobot/common/policies/act/configuration_act.py
index be904425..282df3b0 100644
--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -30,15 +30,11 @@ class ActionChunkingTransformerConfig:
             The key represents the output data name, and the value is a list indicating the dimensions
             of the corresponding data. For example, "action" refers to an output shape of [14], indicating
             14-dimensional actions. Importantly, shapes doesnt include batch dimension or temporal dimension.
-        normalize_input_modes: A dictionary specifying the normalization mode to be applied to various inputs.
-            The key represents the input data name, and the value specifies the type of normalization to apply.
-            Common normalization methods include "mean_std" (mean and standard deviation) or "min_max" (to normalize
-            between -1 and 1).
-        unnormalize_output_modes: A dictionary specifying the method to unnormalize outputs.
-            This parameter maps output data types to their unnormalization modes, allowing the results to be
-            transformed back from a normalized state to a standard state. It is typically used when output
-            data needs to be interpreted in its original scale or units. For example, for "action", the
-            unnormalization mode might be "mean_std" or "min_max".
+        normalize_input_modes: A dictionary with key represents the modality (e.g. "observation.state"),
+            and the value specifies the normalization mode to apply. The two availables
+            modes are "mean_std" which substracts the mean and divide by the standard
+            deviation and "min_max" which rescale in a [-1, 1] range.
+        unnormalize_output_modes: Similar dictionary as `normalize_input_modes`, but to unormalize in original scale.
         vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
         use_pretrained_backbone: Whether the backbone should be initialized with pretrained weights from
             torchvision.
@@ -65,7 +61,7 @@ class ActionChunkingTransformerConfig:
     """
 
     # Environment.
-    # TODO(rcadene, alexander-soar): remove these as they are defined in input_shapes, output_shapes
+    # TODO(rcadene, alexander-soare): remove these as they are defined in input_shapes, output_shapes
     state_dim: int = 14
     action_dim: int = 14
 
@@ -75,13 +71,13 @@ class ActionChunkingTransformerConfig:
     chunk_size: int = 100
     n_action_steps: int = 100
 
-    input_shapes: dict[str, str] = field(
+    input_shapes: dict[str, list[str]] = field(
         default_factory=lambda: {
             "observation.images.top": [3, 480, 640],
             "observation.state": [14],
         }
     )
-    output_shapes: dict[str, str] = field(
+    output_shapes: dict[str, list[str]] = field(
         default_factory=lambda: {
             "action": [14],
         }
diff --git a/lerobot/common/policies/act/modeling_act.py b/lerobot/common/policies/act/modeling_act.py
index 3682598f..869ecd7b 100644
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -72,8 +72,6 @@ class ActionChunkingTransformerPolicy(nn.Module):
         if cfg is None:
             cfg = ActionChunkingTransformerConfig()
         self.cfg = cfg
-        self.normalize_input_modes = cfg.normalize_input_modes
-        self.unnormalize_output_modes = cfg.unnormalize_output_modes
         self.normalize_inputs = Normalize(cfg.input_shapes, cfg.normalize_input_modes, dataset_stats)
         self.unnormalize_outputs = Unnormalize(cfg.output_shapes, cfg.unnormalize_output_modes, dataset_stats)
 
diff --git a/lerobot/common/policies/diffusion/configuration_diffusion.py b/lerobot/common/policies/diffusion/configuration_diffusion.py
index 79652342..789a737b 100644
--- a/lerobot/common/policies/diffusion/configuration_diffusion.py
+++ b/lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -28,15 +28,11 @@ class DiffusionConfig:
             The key represents the output data name, and the value is a list indicating the dimensions
             of the corresponding data. For example, "action" refers to an output shape of [14], indicating
             14-dimensional actions. Importantly, shapes doesnt include batch dimension or temporal dimension.
-        normalize_input_modes: A dictionary specifying the normalization mode to be applied to various inputs.
-            The key represents the input data name, and the value specifies the type of normalization to apply.
-            Common normalization methods include "mean_std" (mean and standard deviation) or "min_max" (to normalize
-            between -1 and 1).
-        unnormalize_output_modes: A dictionary specifying the method to unnormalize outputs.
-            This parameter maps output data types to their unnormalization modes, allowing the results to be
-            transformed back from a normalized state to a standard state. It is typically used when output
-            data needs to be interpreted in its original scale or units. For example, for "action", the
-            unnormalization mode might be "mean_std" or "min_max".
+        normalize_input_modes: A dictionary with key represents the modality (e.g. "observation.state"),
+            and the value specifies the normalization mode to apply. The two availables
+            modes are "mean_std" which substracts the mean and divide by the standard
+            deviation and "min_max" which rescale in a [-1, 1] range.
+        unnormalize_output_modes: Similar dictionary as `normalize_input_modes`, but to unormalize in original scale.
         vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
         crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit
             within the image size. If None, no cropping is done.
@@ -74,7 +70,7 @@ class DiffusionConfig:
 
     # Environment.
     # Inherit these from the environment config.
-    # TODO(rcadene, alexander-soar): remove these as they are defined in input_shapes, output_shapes
+    # TODO(rcadene, alexander-soare): remove these as they are defined in input_shapes, output_shapes
     state_dim: int = 2
     action_dim: int = 2
     image_size: tuple[int, int] = (96, 96)
@@ -84,13 +80,13 @@ class DiffusionConfig:
     horizon: int = 16
     n_action_steps: int = 8
 
-    input_shapes: dict[str, str] = field(
+    input_shapes: dict[str, list[str]] = field(
         default_factory=lambda: {
             "observation.image": [3, 96, 96],
             "observation.state": [2],
         }
     )
-    output_shapes: dict[str, str] = field(
+    output_shapes: dict[str, list[str]] = field(
         default_factory=lambda: {
             "action": [2],
         }
diff --git a/lerobot/common/policies/diffusion/modeling_diffusion.py b/lerobot/common/policies/diffusion/modeling_diffusion.py
index 4bedf373..088b6cb6 100644
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -56,8 +56,6 @@ class DiffusionPolicy(nn.Module):
         if cfg is None:
             cfg = DiffusionConfig()
         self.cfg = cfg
-        self.normalize_input_modes = cfg.normalize_input_modes
-        self.unnormalize_output_modes = cfg.unnormalize_output_modes
         self.normalize_inputs = Normalize(cfg.input_shapes, cfg.normalize_input_modes, dataset_stats)
         self.unnormalize_outputs = Unnormalize(cfg.output_shapes, cfg.unnormalize_output_modes, dataset_stats)
 
diff --git a/lerobot/common/policies/normalize.py b/lerobot/common/policies/normalize.py
index f61066eb..3809ae74 100644
--- a/lerobot/common/policies/normalize.py
+++ b/lerobot/common/policies/normalize.py
@@ -31,18 +31,24 @@ def create_stats_buffers(shapes, modes, stats=None):
     for key, mode in modes.items():
         assert mode in ["mean_std", "min_max"]
 
-        shape = shapes[key]
+        shape = tuple(shapes[key])
 
-        # override shape to be invariant to height and width
         if "image" in key:
-            # assume shape is channel first (b, c, h, w) or (b, t, c, h, w)
-            shape[-1] = 1
-            shape[-2] = 1
+            # sanity checks
+            assert len(shape) == 3, f"number of dimensions of {key} != 3 ({shape=}"
+            c, h, w = shape
+            assert c < h and c < w, f"{key} is not channel first ({shape=})"
+            # override image shape to be invariant to height and width
+            shape = (c, 1, 1)
+
+        # Note: we initialize mean, std, min, max to infinity. They should be overwritten
+        # downstream by `stats` or `policy.load_state_dict`, as expected. During forward,
+        # we assert they are not infinity anymore.
 
         buffer = {}
         if mode == "mean_std":
-            mean = torch.zeros(shape, dtype=torch.float32)
-            std = torch.ones(shape, dtype=torch.float32)
+            mean = torch.ones(shape, dtype=torch.float32) * torch.inf
+            std = torch.ones(shape, dtype=torch.float32) * torch.inf
             buffer = nn.ParameterDict(
                 {
                     "mean": nn.Parameter(mean, requires_grad=False),
@@ -50,9 +56,8 @@ def create_stats_buffers(shapes, modes, stats=None):
                 }
             )
         elif mode == "min_max":
-            # TODO(rcadene): should we assume input is in [-1, 1] range?
-            min = torch.ones(shape, dtype=torch.float32) * -1
-            max = torch.ones(shape, dtype=torch.float32)
+            min = torch.ones(shape, dtype=torch.float32) * torch.inf
+            max = torch.ones(shape, dtype=torch.float32) * torch.inf
             buffer = nn.ParameterDict(
                 {
                     "min": nn.Parameter(min, requires_grad=False),
@@ -109,12 +114,24 @@ class Normalize(nn.Module):
             buffer = getattr(self, "buffer_" + key.replace(".", "_"))
 
             if mode == "mean_std":
-                mean = buffer["mean"].unsqueeze(0)
-                std = buffer["std"].unsqueeze(0)
+                mean = buffer["mean"]
+                std = buffer["std"]
+                assert not torch.isinf(
+                    mean
+                ).any(), "`mean` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
+                assert not torch.isinf(
+                    std
+                ).any(), "`std` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
                 batch[key] = (batch[key] - mean) / (std + 1e-8)
             elif mode == "min_max":
-                min = buffer["min"].unsqueeze(0)
-                max = buffer["max"].unsqueeze(0)
+                min = buffer["min"]
+                max = buffer["max"]
+                assert not torch.isinf(
+                    min
+                ).any(), "`min` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
+                assert not torch.isinf(
+                    max
+                ).any(), "`max` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
                 # normalize to [0,1]
                 batch[key] = (batch[key] - min) / (max - min)
                 # normalize to [-1, 1]
@@ -131,8 +148,8 @@ class Unnormalize(nn.Module):
     The class is initialized with a set of shapes, modes, and optional pre-defined statistics. It creates buffers for unnormalization based
     on these inputs, which are then used to adjust data during the forward pass. The unnormalization process operates on a batch of data,
     with different keys in the batch being normalized according to the specified modes. The following unnormalization modes are supported:
-    - "mean_std": Unnormalizes data using the mean and standard deviation.
-    - "min_max": Unnormalizes data to a [0, 1] range and then to a [-1, 1] range.
+    - "mean_std": Subtracts the mean and divides by the standard deviation.
+    - "min_max": Scales and offsets the data such that the minimum is -1 and the maximum is +1.
 
     Parameters:
         shapes (dict): A dictionary where keys represent tensor identifiers and values represent the shapes of those tensors.
@@ -161,12 +178,24 @@ class Unnormalize(nn.Module):
             buffer = getattr(self, "buffer_" + key.replace(".", "_"))
 
             if mode == "mean_std":
-                mean = buffer["mean"].unsqueeze(0)
-                std = buffer["std"].unsqueeze(0)
+                mean = buffer["mean"]
+                std = buffer["std"]
+                assert not torch.isinf(
+                    mean
+                ).any(), "`mean` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
+                assert not torch.isinf(
+                    std
+                ).any(), "`std` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
                 batch[key] = batch[key] * std + mean
             elif mode == "min_max":
-                min = buffer["min"].unsqueeze(0)
-                max = buffer["max"].unsqueeze(0)
+                min = buffer["min"]
+                max = buffer["max"]
+                assert not torch.isinf(
+                    min
+                ).any(), "`min` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
+                assert not torch.isinf(
+                    max
+                ).any(), "`max` is infinity. You forgot to initialize with `stats` as argument, or called `policy.load_state_dict`."
                 batch[key] = (batch[key] + 1) / 2
                 batch[key] = batch[key] * (max - min) + min
             else:
diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml
index 6fd7467f..69b65011 100644
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -35,7 +35,7 @@ policy:
   n_action_steps: 100
 
   input_shapes:
-    # TODO(rcadene, alexander-soar): add variables for height and width from the dataset/env?
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
     observation.images.top: [3, 480, 640]
     observation.state: ["${policy.state_dim}"]
   output_shapes:
diff --git a/lerobot/configs/policy/diffusion.yaml b/lerobot/configs/policy/diffusion.yaml
index d769413e..45e27d2c 100644
--- a/lerobot/configs/policy/diffusion.yaml
+++ b/lerobot/configs/policy/diffusion.yaml
@@ -51,7 +51,7 @@ policy:
   n_action_steps: ${n_action_steps}
 
   input_shapes:
-    # TODO(rcadene, alexander-soar): add variables for height and width from the dataset/env?
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
     observation.image: [3, 96, 96]
     observation.state: ["${policy.state_dim}"]
   output_shapes:
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index 1033ae8f..c849cce8 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -339,7 +339,6 @@ def train(cfg: dict, out_dir=None, job_name=None):
             eval_info = eval_policy(
                 rollout_env,
                 policy,
-                transform=offline_dataset.transform,
                 return_episode_data=True,
                 seed=cfg.seed,
             )
diff --git a/tests/test_policies.py b/tests/test_policies.py
index 2e8242e1..69f3422e 100644
--- a/tests/test_policies.py
+++ b/tests/test_policies.py
@@ -96,9 +96,8 @@ def test_policy(env_name, policy_name, extra_overrides):
 
     # Test load state_dict
     if policy_name != "tdmpc":
-        # TODO(rcadene, alexander-soar): make it work for tdmpc
-        # TODO(rcadene, alexander-soar): how to remove need for dataset_stats?
-        new_policy = make_policy(cfg, dataset_stats=dataset.stats)
+        # TODO(rcadene, alexander-soare): make it work for tdmpc
+        new_policy = make_policy(cfg)
         new_policy.load_state_dict(policy.state_dict())
 
 
@@ -110,7 +109,7 @@ def test_policy(env_name, policy_name, extra_overrides):
     ],
 )
 def test_normalize(insert_temporal_dim):
-    # TODO(rcadene, alexander-soar): test with real data and assert results of normalization/unnormalization
+    # TODO(rcadene, alexander-soare): test with real data and assert results of normalization/unnormalization
 
     input_shapes = {
         "observation.image": [3, 96, 96],
@@ -170,7 +169,8 @@ def test_normalize(insert_temporal_dim):
 
     # test without stats
     normalize = Normalize(input_shapes, normalize_input_modes, stats=None)
-    normalize(input_batch)
+    with pytest.raises(AssertionError):
+        normalize(input_batch)
 
     # test with stats
     normalize = Normalize(input_shapes, normalize_input_modes, stats=dataset_stats)
@@ -183,7 +183,8 @@ def test_normalize(insert_temporal_dim):
 
     # test wihtout stats
     unnormalize = Unnormalize(output_shapes, unnormalize_output_modes, stats=None)
-    unnormalize(output_batch)
+    with pytest.raises(AssertionError):
+        unnormalize(output_batch)
 
     # test with stats
     unnormalize = Unnormalize(output_shapes, unnormalize_output_modes, stats=dataset_stats)