From eb530fa59503d20d7c11076210e86aa887a8a36b Mon Sep 17 00:00:00 2001
From: Simon Alibert
Date: Thu, 16 May 2024 19:31:57 +0200
Subject: [PATCH] Add '--independent' flag

---
 lerobot/scripts/compare_policies.py | 73 +++++++++++++++++------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/lerobot/scripts/compare_policies.py b/lerobot/scripts/compare_policies.py
index 15c34798..11e49f95 100644
--- a/lerobot/scripts/compare_policies.py
+++ b/lerobot/scripts/compare_policies.py
@@ -11,8 +11,16 @@ python lerobot/scripts/compare_policies.py \
     output/eval/new_policy/eval_info.json
 ```
 
-This script can accept `eval_info.json` dicts with identical seeds between each eval episode of ref_policy and new_policy
-(paired-samples) or from evals performed with different seeds (independent samples).
+This script can accept `eval_info.json` dicts with identical seeds between each eval episode of ref_policy and
+new_policy (paired samples) or from evals performed with different seeds (independent samples).
+
+The script first runs normality tests to determine whether parametric tests can be used, then checks
+whether the policies' metrics differ significantly using the appropriate tests.
+
+CAVEAT: by default, this script compares seed numbers to decide whether samples can be considered paired.
+If the environment has changed between the ref_policy eval and the new_policy eval, use the
+`--independent` flag to override this and keep the samples unpaired even if they have identical
+seeds.
 """
 
 import argparse
@@ -23,12 +31,12 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
 import scipy.stats as stats
-from scipy.stats import anderson, kstest, mannwhitneyu, shapiro, ttest_ind, ttest_rel, wilcoxon
+from scipy.stats import anderson, kstest, mannwhitneyu, normaltest, shapiro, ttest_ind, ttest_rel, wilcoxon
 from statsmodels.stats.contingency_tables import mcnemar
 from termcolor import colored
 
 
-def init_logging(output_dir: Path) -> None:
+def init_logging() -> None:
     logging.basicConfig(
         level=logging.INFO,
         format="%(message)s",
@@ -42,6 +50,18 @@ def log_section(title: str) -> None:
     logging.info(section_title)
 
 
+def log_test(msg: str, p_value: float):
+    if p_value < 0.01:
+        color, interpretation = "red", "H_0 Rejected"
+    elif 0.01 <= p_value < 0.05:
+        color, interpretation = "yellow", "Inconclusive"
+    else:
+        color, interpretation = "green", "H_0 Not Rejected"
+    logging.info(
+        f"{msg}, p-value = {colored(f'{p_value:.3f}', color)} -> {colored(f'{interpretation}', color, attrs=['bold'])}"
+    )
+
+
 def get_eval_info_episodes(eval_info_path: Path) -> dict:
     with open(eval_info_path) as f:
         eval_info = json.load(f)
@@ -55,7 +75,7 @@ def get_eval_info_episodes(eval_info_path: Path) -> dict:
     }
 
 
-def descriptive_stats(ref_sample: dict, new_sample: dict, metric_name: str):
+def describe_samples(ref_sample: dict, new_sample: dict, metric_name: str):
     ref_mean, ref_std = np.mean(ref_sample[metric_name]), np.std(ref_sample[metric_name])
     new_mean, new_std = np.mean(new_sample[metric_name]), np.std(new_sample[metric_name])
     logging.info(f"{metric_name} - Ref sample: mean = {ref_mean:.3f}, std = {ref_std:.3f}")
@@ -67,18 +87,20 @@ def cohens_d(x, y):
 
 
 def normality_tests(array: np.ndarray, name: str):
-    shapiro_stat, shapiro_p = shapiro(array)
+    ap_stat, ap_p = normaltest(array)
+    sw_stat, sw_p = shapiro(array)
     ks_stat, ks_p = kstest(array, "norm", args=(np.mean(array), np.std(array)))
     ad_stat = anderson(array)
 
-    log_test(f"{name} - Shapiro-Wilk Test: statistic = {shapiro_stat:.3f}", shapiro_p)
-    log_test(f"{name} - Kolmogorov-Smirnov Test: statistic = {ks_stat:.3f}", ks_p)
-    logging.info(f"{name} - Anderson-Darling Test: statistic = {ad_stat.statistic:.3f}")
+    log_test(f"{name} - D'Agostino and Pearson test: statistic = {ap_stat:.3f}", ap_p)
+    log_test(f"{name} - Shapiro-Wilk test: statistic = {sw_stat:.3f}", sw_p)
+    log_test(f"{name} - Kolmogorov-Smirnov test: statistic = {ks_stat:.3f}", ks_p)
+    logging.info(f"{name} - Anderson-Darling test: statistic = {ad_stat.statistic:.3f}")
     for i in range(len(ad_stat.critical_values)):
         cv, sl = ad_stat.critical_values[i], ad_stat.significance_level[i]
         logging.info(f"    Critical value at {sl}%: {cv:.3f}")
 
-    return shapiro_p > 0.05 and ks_p > 0.05
+    return sw_p > 0.05 and ks_p > 0.05
 
 
 def plot_boxplot(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
@@ -104,18 +126,6 @@ def plot_qqplot(data: np.ndarray, title: str, filename: str):
     plt.close()
 
 
-def log_test(msg, p_value):
-    if p_value < 0.01:
-        color, interpretation = "red", "H_0 Rejected"
-    elif 0.01 <= p_value < 0.05:
-        color, interpretation = "orange", "Inconclusive"
-    else:
-        color, interpretation = "green", "H_0 Not Rejected"
-    logging.info(
-        f"{msg}, p-value = {colored(f'{p_value:.3f}', color)} -> {colored(f'{interpretation}', color, attrs=['bold'])}"
-    )
-
-
 def paired_sample_tests(ref_sample: dict, new_sample: dict):
     log_section("Normality tests")
     max_reward_diff = ref_sample["max_rewards"] - new_sample["max_rewards"]
@@ -185,22 +195,22 @@ def independent_sample_tests(ref_sample: dict, new_sample: dict):
     log_test(f"Mann-Whitney U test for Sum Reward: U-statistic = {u_stat_sum_reward:.3f}", p_u_sum_reward)
 
 
-def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path):
+def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path, independent: bool = False):
     log_section("Descriptive Stats")
     logging.info(f"Number of episode - Ref Sample: {ref_sample['num_episodes']}")
     logging.info(f"Number of episode - New Sample: {new_sample['num_episodes']}")
 
     seeds_a, seeds_b = ref_sample["seeds"], new_sample["seeds"]
-    if seeds_a == seeds_b:
+    if (seeds_a == seeds_b) and not independent:
         logging.info("Samples are paired (identical seeds).")
         paired = True
     else:
         logging.info("Samples are considered independent (seeds are different).")
         paired = False
 
-    descriptive_stats(ref_sample, new_sample, "successes")
-    descriptive_stats(ref_sample, new_sample, "max_rewards")
-    descriptive_stats(ref_sample, new_sample, "sum_rewards")
+    describe_samples(ref_sample, new_sample, "successes")
+    describe_samples(ref_sample, new_sample, "max_rewards")
+    describe_samples(ref_sample, new_sample, "sum_rewards")
 
     log_section("Effect Size")
     d_max_reward = cohens_d(ref_sample["max_rewards"], new_sample["max_rewards"])
@@ -273,6 +283,11 @@ if __name__ == "__main__":
     )
     parser.add_argument("ref_sample_path", type=Path, help="Path to the reference sample JSON file.")
     parser.add_argument("new_sample_path", type=Path, help="Path to the new sample JSON file.")
+    parser.add_argument(
+        "--independent",
+        action="store_true",
+        help="Ignore seeds and consider samples to be independent (unpaired).",
+    )
     parser.add_argument(
         "--output_dir",
         type=Path,
@@ -280,8 +295,8 @@
         help="Directory to save the output results. Defaults to outputs/compare/",
     )
     args = parser.parse_args()
-    init_logging(args.output_dir)
+    init_logging()
 
     ref_sample = get_eval_info_episodes(args.ref_sample_path)
     new_sample = get_eval_info_episodes(args.new_sample_path)
-    perform_tests(ref_sample, new_sample, args.output_dir)
+    perform_tests(ref_sample, new_sample, args.output_dir, args.independent)
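
For context, here is a rough, hypothetical sketch (not code from `compare_policies.py` and not part of this patch) of the decision the new flag controls: when `--independent` is passed, the paired branch is skipped even if the seeds match, so the paired tests (`ttest_rel`, `wilcoxon`) presumably give way to their unpaired counterparts (`ttest_ind`, `mannwhitneyu`). The sample data, the `seeds_match`/`independent` variables, and the 0.05 normality cutoff below are illustrative assumptions.

```python
# Illustrative sketch only -- not the script's actual implementation.
import numpy as np
from scipy.stats import mannwhitneyu, shapiro, ttest_ind, ttest_rel, wilcoxon

rng = np.random.default_rng(seed=0)
ref = rng.normal(0.70, 0.10, size=50)  # stand-in for one metric (e.g. "max_rewards") of ref_policy
new = rng.normal(0.75, 0.10, size=50)  # stand-in for the same metric of new_policy

seeds_match = True
independent = True   # i.e. the --independent flag was passed
# Samples are treated as paired only if seeds match AND --independent was not passed.
paired = seeds_match and not independent

if paired:
    # Paired branch: test normality of the per-episode differences,
    # then pick the paired t-test or the Wilcoxon signed-rank test.
    normal = shapiro(ref - new).pvalue > 0.05
    stat, p = ttest_rel(ref, new) if normal else wilcoxon(ref, new)
else:
    # Unpaired branch: both samples should look normal for the two-sample t-test,
    # otherwise fall back to the Mann-Whitney U test.
    normal = shapiro(ref).pvalue > 0.05 and shapiro(new).pvalue > 0.05
    stat, p = ttest_ind(ref, new) if normal else mannwhitneyu(ref, new)

print(f"paired={paired}, statistic={stat:.3f}, p-value={p:.3f}")
```

From the command line, the unpaired branch is forced by appending the flag after the two positional arguments, e.g. `python lerobot/scripts/compare_policies.py output/eval/ref_policy/eval_info.json output/eval/new_policy/eval_info.json --independent`.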