Add argparse, refactor & cleanup
This commit is contained in:
parent
10036c1219
commit
8f5cfcd73d
|
@ -1,3 +1,21 @@
|
||||||
|
"""Compare two policies on based on metrics computed from an eval.
|
||||||
|
|
||||||
|
Usage example:
|
||||||
|
|
||||||
|
You just made changes to a policy and you want to assess its new performance against
|
||||||
|
the reference policy (i.e. before your changes).
|
||||||
|
|
||||||
|
```
|
||||||
|
python lerobot/scripts/compare_policies.py \
|
||||||
|
output/eval/ref_policy/eval_info.json \
|
||||||
|
output/eval/new_policy/eval_info.json
|
||||||
|
```
|
||||||
|
|
||||||
|
This script can accept `eval_info.json` dicts with identical seeds between each eval episode of ref_policy and new_policy
|
||||||
|
(paired-samples) or from evals performed with different seeds (independent samples).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -9,55 +27,49 @@ from scipy.stats import anderson, kstest, mannwhitneyu, shapiro, ttest_ind, ttes
|
||||||
from statsmodels.stats.contingency_tables import mcnemar
|
from statsmodels.stats.contingency_tables import mcnemar
|
||||||
from termcolor import colored
|
from termcolor import colored
|
||||||
|
|
||||||
SAMPLE_A_PATH = Path("outputs/eval/ema.json")
|
|
||||||
SAMPLE_B_PATH = Path("outputs/eval/no_ema.json")
|
def init_logging(output_dir: Path) -> None:
|
||||||
OUTPUT_DIR = Path("outputs/compare/")
|
logging.basicConfig(
|
||||||
LOG_FILE = OUTPUT_DIR / "comparison.log"
|
level=logging.INFO,
|
||||||
|
format="%(message)s",
|
||||||
|
handlers=[logging.StreamHandler()],
|
||||||
|
)
|
||||||
|
logging.getLogger("matplotlib.font_manager").disabled = True
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig(
|
def log_section(title: str) -> None:
|
||||||
level=logging.INFO,
|
|
||||||
format="%(message)s",
|
|
||||||
handlers=[logging.FileHandler(LOG_FILE, mode="w"), logging.StreamHandler()],
|
|
||||||
)
|
|
||||||
|
|
||||||
logging.getLogger("matplotlib.font_manager").disabled = True
|
|
||||||
|
|
||||||
|
|
||||||
def log_section(title):
|
|
||||||
section_title = f"\n{'-'*21}\n {title.center(19)} \n{'-'*21}"
|
section_title = f"\n{'-'*21}\n {title.center(19)} \n{'-'*21}"
|
||||||
logging.info(section_title)
|
logging.info(section_title)
|
||||||
|
|
||||||
|
|
||||||
def get_eval_info_episodes(eval_info_path: Path):
|
def get_eval_info_episodes(eval_info_path: Path) -> dict:
|
||||||
with open(eval_info_path) as f:
|
with open(eval_info_path) as f:
|
||||||
eval_info = json.load(f)
|
eval_info = json.load(f)
|
||||||
|
|
||||||
stats = {
|
return {
|
||||||
"sum_rewards": [ep_stat["sum_reward"] for ep_stat in eval_info["per_episode"]],
|
"sum_rewards": np.array([ep_stat["sum_reward"] for ep_stat in eval_info["per_episode"]]),
|
||||||
"max_rewards": [ep_stat["max_reward"] for ep_stat in eval_info["per_episode"]],
|
"max_rewards": np.array([ep_stat["max_reward"] for ep_stat in eval_info["per_episode"]]),
|
||||||
"successes": [ep_stat["success"] for ep_stat in eval_info["per_episode"]],
|
"successes": np.array([ep_stat["success"] for ep_stat in eval_info["per_episode"]]),
|
||||||
"seeds": [ep_stat["seed"] for ep_stat in eval_info["per_episode"]],
|
"seeds": [ep_stat["seed"] for ep_stat in eval_info["per_episode"]],
|
||||||
"num_episodes": len(eval_info["per_episode"]),
|
"num_episodes": len(eval_info["per_episode"]),
|
||||||
}
|
}
|
||||||
return stats
|
|
||||||
|
|
||||||
|
|
||||||
def descriptive_stats(stats_a, stats_b, metric_name):
|
def descriptive_stats(ref_sample: dict, new_sample: dict, metric_name: str):
|
||||||
a_mean, a_std = np.mean(stats_a[metric_name]), np.std(stats_a[metric_name])
|
ref_mean, ref_std = np.mean(ref_sample[metric_name]), np.std(ref_sample[metric_name])
|
||||||
b_mean, b_std = np.mean(stats_b[metric_name]), np.std(stats_b[metric_name])
|
new_mean, new_std = np.mean(new_sample[metric_name]), np.std(new_sample[metric_name])
|
||||||
logging.info(f"{metric_name} - Sample A: Mean = {a_mean:.3f}, Std Dev = {a_std:.3f}")
|
logging.info(f"{metric_name} - Ref sample: mean = {ref_mean:.3f}, std = {ref_std:.3f}")
|
||||||
logging.info(f"{metric_name} - Sample B: Mean = {b_mean:.3f}, Std Dev = {b_std:.3f}")
|
logging.info(f"{metric_name} - New sample: mean = {new_mean:.3f}, std = {new_std:.3f}")
|
||||||
|
|
||||||
|
|
||||||
def cohens_d(x, y):
|
def cohens_d(x, y):
|
||||||
return (np.mean(x) - np.mean(y)) / np.sqrt((np.std(x, ddof=1) ** 2 + np.std(y, ddof=1) ** 2) / 2)
|
return (np.mean(x) - np.mean(y)) / np.sqrt((np.std(x, ddof=1) ** 2 + np.std(y, ddof=1) ** 2) / 2)
|
||||||
|
|
||||||
|
|
||||||
def normality_tests(data, name):
|
def normality_tests(array: np.ndarray, name: str):
|
||||||
shapiro_stat, shapiro_p = shapiro(data)
|
shapiro_stat, shapiro_p = shapiro(array)
|
||||||
ks_stat, ks_p = kstest(data, "norm", args=(np.mean(data), np.std(data)))
|
ks_stat, ks_p = kstest(array, "norm", args=(np.mean(array), np.std(array)))
|
||||||
ad_stat = anderson(data)
|
ad_stat = anderson(array)
|
||||||
|
|
||||||
log_test(f"{name} - Shapiro-Wilk Test: statistic = {shapiro_stat:.3f}", shapiro_p)
|
log_test(f"{name} - Shapiro-Wilk Test: statistic = {shapiro_stat:.3f}", shapiro_p)
|
||||||
log_test(f"{name} - Kolmogorov-Smirnov Test: statistic = {ks_stat:.3f}", ks_p)
|
log_test(f"{name} - Kolmogorov-Smirnov Test: statistic = {ks_stat:.3f}", ks_p)
|
||||||
|
@ -69,14 +81,14 @@ def normality_tests(data, name):
|
||||||
return shapiro_p > 0.05 and ks_p > 0.05
|
return shapiro_p > 0.05 and ks_p > 0.05
|
||||||
|
|
||||||
|
|
||||||
def plot_boxplot(data_a, data_b, labels, title, filename):
|
def plot_boxplot(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
|
||||||
plt.boxplot([data_a, data_b], labels=labels)
|
plt.boxplot([data_a, data_b], labels=labels)
|
||||||
plt.title(title)
|
plt.title(title)
|
||||||
plt.savefig(filename)
|
plt.savefig(filename)
|
||||||
plt.close()
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
def plot_histogram(data_a, data_b, labels, title, filename):
|
def plot_histogram(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
|
||||||
plt.hist(data_a, bins=30, alpha=0.7, label=labels[0])
|
plt.hist(data_a, bins=30, alpha=0.7, label=labels[0])
|
||||||
plt.hist(data_b, bins=30, alpha=0.7, label=labels[1])
|
plt.hist(data_b, bins=30, alpha=0.7, label=labels[1])
|
||||||
plt.title(title)
|
plt.title(title)
|
||||||
|
@ -85,7 +97,7 @@ def plot_histogram(data_a, data_b, labels, title, filename):
|
||||||
plt.close()
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
def plot_qqplot(data, title, filename):
|
def plot_qqplot(data: np.ndarray, title: str, filename: str):
|
||||||
stats.probplot(data, dist="norm", plot=plt)
|
stats.probplot(data, dist="norm", plot=plt)
|
||||||
plt.title(title)
|
plt.title(title)
|
||||||
plt.savefig(filename)
|
plt.savefig(filename)
|
||||||
|
@ -97,53 +109,48 @@ def log_test(msg, p_value):
|
||||||
color, interpretation = "red", "H_0 Rejected"
|
color, interpretation = "red", "H_0 Rejected"
|
||||||
elif 0.01 <= p_value < 0.05:
|
elif 0.01 <= p_value < 0.05:
|
||||||
color, interpretation = "orange", "Inconclusive"
|
color, interpretation = "orange", "Inconclusive"
|
||||||
return "warning", "Inconclusive"
|
|
||||||
else:
|
else:
|
||||||
color, interpretation = "green", "H_0 Not Rejected"
|
color, interpretation = "green", "H_0 Not Rejected"
|
||||||
logging.info(
|
logging.info(
|
||||||
msg
|
f"{msg}, p-value = {colored(f'{p_value:.3f}', color)} -> {colored(f'{interpretation}', color, attrs=['bold'])}"
|
||||||
+ ", p-value = "
|
|
||||||
+ colored(f"{p_value:.3f}", color)
|
|
||||||
+ " -> "
|
|
||||||
+ colored(f"{interpretation}", color, attrs=["bold"])
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def perform_paired_tests(stats_a, stats_b):
|
def paired_sample_tests(ref_sample: dict, new_sample: dict):
|
||||||
max_rewards_a, max_rewards_b = np.array(stats_a["max_rewards"]), np.array(stats_b["max_rewards"])
|
|
||||||
sum_rewards_a, sum_rewards_b = np.array(stats_a["sum_rewards"]), np.array(stats_b["sum_rewards"])
|
|
||||||
successes_a, successes_b = np.array(stats_a["successes"]), np.array(stats_b["successes"])
|
|
||||||
max_reward_diff = max_rewards_a - max_rewards_b
|
|
||||||
sum_reward_diff = sum_rewards_a - sum_rewards_b
|
|
||||||
|
|
||||||
log_section("Normality tests")
|
log_section("Normality tests")
|
||||||
|
max_reward_diff = ref_sample["max_rewards"] - new_sample["max_rewards"]
|
||||||
|
sum_reward_diff = ref_sample["sum_rewards"] - new_sample["sum_rewards"]
|
||||||
normal_max_reward_diff = normality_tests(max_reward_diff, "Max Reward Difference")
|
normal_max_reward_diff = normality_tests(max_reward_diff, "Max Reward Difference")
|
||||||
normal_sum_reward_diff = normality_tests(sum_reward_diff, "Sum Reward Difference")
|
normal_sum_reward_diff = normality_tests(sum_reward_diff, "Sum Reward Difference")
|
||||||
|
|
||||||
log_section("Paired-sample tests")
|
log_section("Paired-sample tests")
|
||||||
if normal_max_reward_diff:
|
if normal_max_reward_diff:
|
||||||
t_stat_max_reward, p_val_max_reward = ttest_rel(max_rewards_a, max_rewards_b)
|
t_stat_max_reward, p_val_max_reward = ttest_rel(ref_sample["max_rewards"], new_sample["max_rewards"])
|
||||||
log_test(f"Paired t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
|
log_test(f"Paired t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
|
||||||
else:
|
else:
|
||||||
w_stat_max_reward, p_wilcox_max_reward = wilcoxon(max_rewards_a, max_rewards_b)
|
w_stat_max_reward, p_wilcox_max_reward = wilcoxon(
|
||||||
|
ref_sample["max_rewards"], new_sample["max_rewards"]
|
||||||
|
)
|
||||||
log_test(f"Wilcoxon test for Max Reward: statistic = {w_stat_max_reward:.3f}", p_wilcox_max_reward)
|
log_test(f"Wilcoxon test for Max Reward: statistic = {w_stat_max_reward:.3f}", p_wilcox_max_reward)
|
||||||
|
|
||||||
if normal_sum_reward_diff:
|
if normal_sum_reward_diff:
|
||||||
t_stat_sum_reward, p_val_sum_reward = ttest_rel(sum_rewards_a, sum_rewards_b)
|
t_stat_sum_reward, p_val_sum_reward = ttest_rel(ref_sample["sum_rewards"], new_sample["sum_rewards"])
|
||||||
log_test(f"Paired t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
|
log_test(f"Paired t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
|
||||||
else:
|
else:
|
||||||
w_stat_sum_reward, p_wilcox_sum_reward = wilcoxon(sum_rewards_a, sum_rewards_b)
|
w_stat_sum_reward, p_wilcox_sum_reward = wilcoxon(
|
||||||
|
ref_sample["sum_rewards"], new_sample["sum_rewards"]
|
||||||
|
)
|
||||||
log_test(f"Wilcoxon test for Sum Reward: statistic = {w_stat_sum_reward:.3f}", p_wilcox_sum_reward)
|
log_test(f"Wilcoxon test for Sum Reward: statistic = {w_stat_sum_reward:.3f}", p_wilcox_sum_reward)
|
||||||
|
|
||||||
table = np.array(
|
table = np.array(
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
np.sum((successes_a == 1) & (successes_b == 1)),
|
np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 1)),
|
||||||
np.sum((successes_a == 1) & (successes_b == 0)),
|
np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 0)),
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
np.sum((successes_a == 0) & (successes_b == 1)),
|
np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 1)),
|
||||||
np.sum((successes_a == 0) & (successes_b == 0)),
|
np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 0)),
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
@ -151,116 +158,130 @@ def perform_paired_tests(stats_a, stats_b):
|
||||||
log_test(f"McNemar's test for Success: statistic = {mcnemar_result.statistic:.3f}", mcnemar_result.pvalue)
|
log_test(f"McNemar's test for Success: statistic = {mcnemar_result.statistic:.3f}", mcnemar_result.pvalue)
|
||||||
|
|
||||||
|
|
||||||
def perform_independent_tests(stats_a, stats_b):
|
def independent_sample_tests(ref_sample: dict, new_sample: dict):
|
||||||
max_rewards_a, max_rewards_b = np.array(stats_a["max_rewards"]), np.array(stats_b["max_rewards"])
|
|
||||||
sum_rewards_a, sum_rewards_b = np.array(stats_a["sum_rewards"]), np.array(stats_b["sum_rewards"])
|
|
||||||
|
|
||||||
log_section("Normality tests")
|
log_section("Normality tests")
|
||||||
normal_max_rewards_a = normality_tests(max_rewards_a, "Max Rewards Sample A")
|
normal_max_rewards_a = normality_tests(ref_sample["max_rewards"], "Max Rewards Ref Sample")
|
||||||
normal_max_rewards_b = normality_tests(max_rewards_b, "Max Rewards Sample B")
|
normal_max_rewards_b = normality_tests(new_sample["max_rewards"], "Max Rewards New Sample")
|
||||||
normal_sum_rewards_a = normality_tests(sum_rewards_a, "Sum Rewards Sample A")
|
normal_sum_rewards_a = normality_tests(ref_sample["sum_rewards"], "Sum Rewards Ref Sample")
|
||||||
normal_sum_rewards_b = normality_tests(sum_rewards_b, "Sum Rewards Sample B")
|
normal_sum_rewards_b = normality_tests(new_sample["sum_rewards"], "Sum Rewards New Sample")
|
||||||
|
|
||||||
log_section("Independent samples tests")
|
log_section("Independent samples tests")
|
||||||
if normal_max_rewards_a and normal_max_rewards_b:
|
if normal_max_rewards_a and normal_max_rewards_b:
|
||||||
t_stat_max_reward, p_val_max_reward = ttest_ind(max_rewards_a, max_rewards_b, equal_var=False)
|
t_stat_max_reward, p_val_max_reward = ttest_ind(
|
||||||
|
ref_sample["max_rewards"], new_sample["max_rewards"], equal_var=False
|
||||||
|
)
|
||||||
log_test(f"Two-Sample t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
|
log_test(f"Two-Sample t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
|
||||||
else:
|
else:
|
||||||
u_stat_max_reward, p_u_max_reward = mannwhitneyu(max_rewards_a, max_rewards_b)
|
u_stat_max_reward, p_u_max_reward = mannwhitneyu(ref_sample["max_rewards"], new_sample["max_rewards"])
|
||||||
log_test(f"Mann-Whitney U test for Max Reward: U-statistic = {u_stat_max_reward:.3f}", p_u_max_reward)
|
log_test(f"Mann-Whitney U test for Max Reward: U-statistic = {u_stat_max_reward:.3f}", p_u_max_reward)
|
||||||
|
|
||||||
if normal_sum_rewards_a and normal_sum_rewards_b:
|
if normal_sum_rewards_a and normal_sum_rewards_b:
|
||||||
t_stat_sum_reward, p_val_sum_reward = ttest_ind(sum_rewards_a, sum_rewards_b, equal_var=False)
|
t_stat_sum_reward, p_val_sum_reward = ttest_ind(
|
||||||
|
ref_sample["sum_rewards"], new_sample["sum_rewards"], equal_var=False
|
||||||
|
)
|
||||||
log_test(f"Two-Sample t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
|
log_test(f"Two-Sample t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
|
||||||
else:
|
else:
|
||||||
u_stat_sum_reward, p_u_sum_reward = mannwhitneyu(sum_rewards_a, sum_rewards_b)
|
u_stat_sum_reward, p_u_sum_reward = mannwhitneyu(ref_sample["sum_rewards"], new_sample["sum_rewards"])
|
||||||
log_test(f"Mann-Whitney U test for Sum Reward: U-statistic = {u_stat_sum_reward:.3f}", p_u_sum_reward)
|
log_test(f"Mann-Whitney U test for Sum Reward: U-statistic = {u_stat_sum_reward:.3f}", p_u_sum_reward)
|
||||||
|
|
||||||
|
|
||||||
def perform_tests(sample_a_stats, sample_b_stats):
|
def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path):
|
||||||
log_section("Descriptive Stats")
|
log_section("Descriptive Stats")
|
||||||
logging.info(f"Number of episode - sample A: {sample_a_stats['num_episodes']}")
|
logging.info(f"Number of episode - Ref Sample: {ref_sample['num_episodes']}")
|
||||||
logging.info(f"Number of episode - sample B: {sample_b_stats['num_episodes']}")
|
logging.info(f"Number of episode - New Sample: {new_sample['num_episodes']}")
|
||||||
|
|
||||||
seeds_a, seeds_b = sample_a_stats["seeds"], sample_b_stats["seeds"]
|
seeds_a, seeds_b = ref_sample["seeds"], new_sample["seeds"]
|
||||||
if seeds_a == seeds_b:
|
if seeds_a == seeds_b:
|
||||||
logging.info("Samples are paired (identical seeds).")
|
logging.info("Samples are paired (identical seeds).")
|
||||||
paired = True
|
paired = True
|
||||||
else:
|
else:
|
||||||
logging.info("Samples are considered independant (seeds are different).")
|
logging.info("Samples are considered independent (seeds are different).")
|
||||||
paired = False
|
paired = False
|
||||||
|
|
||||||
descriptive_stats(sample_a_stats, sample_b_stats, "successes")
|
descriptive_stats(ref_sample, new_sample, "successes")
|
||||||
descriptive_stats(sample_a_stats, sample_b_stats, "max_rewards")
|
descriptive_stats(ref_sample, new_sample, "max_rewards")
|
||||||
descriptive_stats(sample_a_stats, sample_b_stats, "sum_rewards")
|
descriptive_stats(ref_sample, new_sample, "sum_rewards")
|
||||||
|
|
||||||
log_section("Effect Size")
|
log_section("Effect Size")
|
||||||
d_max_reward = cohens_d(sample_a_stats["max_rewards"], sample_b_stats["max_rewards"])
|
d_max_reward = cohens_d(ref_sample["max_rewards"], new_sample["max_rewards"])
|
||||||
d_sum_reward = cohens_d(sample_a_stats["sum_rewards"], sample_b_stats["sum_rewards"])
|
d_sum_reward = cohens_d(ref_sample["sum_rewards"], new_sample["sum_rewards"])
|
||||||
logging.info(f"Cohen's d for Max Reward: {d_max_reward:.3f}")
|
logging.info(f"Cohen's d for Max Reward: {d_max_reward:.3f}")
|
||||||
logging.info(f"Cohen's d for Sum Reward: {d_sum_reward:.3f}")
|
logging.info(f"Cohen's d for Sum Reward: {d_sum_reward:.3f}")
|
||||||
|
|
||||||
if paired:
|
if paired:
|
||||||
perform_paired_tests(sample_a_stats, sample_b_stats)
|
paired_sample_tests(ref_sample, new_sample)
|
||||||
else:
|
else:
|
||||||
perform_independent_tests(sample_a_stats, sample_b_stats)
|
independent_sample_tests(ref_sample, new_sample)
|
||||||
|
|
||||||
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
|
output_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
plot_boxplot(
|
plot_boxplot(
|
||||||
sample_a_stats["max_rewards"],
|
ref_sample["max_rewards"],
|
||||||
sample_b_stats["max_rewards"],
|
new_sample["max_rewards"],
|
||||||
["Sample A Max Reward", "Sample B Max Reward"],
|
["Ref Sample Max Reward", "New Sample Max Reward"],
|
||||||
"Boxplot of Max Rewards",
|
"Boxplot of Max Rewards",
|
||||||
f"{OUTPUT_DIR}/boxplot_max_reward.png",
|
f"{output_dir}/boxplot_max_reward.png",
|
||||||
)
|
)
|
||||||
plot_boxplot(
|
plot_boxplot(
|
||||||
sample_a_stats["sum_rewards"],
|
ref_sample["sum_rewards"],
|
||||||
sample_b_stats["sum_rewards"],
|
new_sample["sum_rewards"],
|
||||||
["Sample A Sum Reward", "Sample B Sum Reward"],
|
["Ref Sample Sum Reward", "New Sample Sum Reward"],
|
||||||
"Boxplot of Sum Rewards",
|
"Boxplot of Sum Rewards",
|
||||||
f"{OUTPUT_DIR}/boxplot_sum_reward.png",
|
f"{output_dir}/boxplot_sum_reward.png",
|
||||||
)
|
)
|
||||||
|
|
||||||
plot_histogram(
|
plot_histogram(
|
||||||
sample_a_stats["max_rewards"],
|
ref_sample["max_rewards"],
|
||||||
sample_b_stats["max_rewards"],
|
new_sample["max_rewards"],
|
||||||
["Sample A Max Reward", "Sample B Max Reward"],
|
["Ref Sample Max Reward", "New Sample Max Reward"],
|
||||||
"Histogram of Max Rewards",
|
"Histogram of Max Rewards",
|
||||||
f"{OUTPUT_DIR}/histogram_max_reward.png",
|
f"{output_dir}/histogram_max_reward.png",
|
||||||
)
|
)
|
||||||
plot_histogram(
|
plot_histogram(
|
||||||
sample_a_stats["sum_rewards"],
|
ref_sample["sum_rewards"],
|
||||||
sample_b_stats["sum_rewards"],
|
new_sample["sum_rewards"],
|
||||||
["Sample A Sum Reward", "Sample B Sum Reward"],
|
["Ref Sample Sum Reward", "New Sample Sum Reward"],
|
||||||
"Histogram of Sum Rewards",
|
"Histogram of Sum Rewards",
|
||||||
f"{OUTPUT_DIR}/histogram_sum_reward.png",
|
f"{output_dir}/histogram_sum_reward.png",
|
||||||
)
|
)
|
||||||
|
|
||||||
plot_qqplot(
|
plot_qqplot(
|
||||||
sample_a_stats["max_rewards"],
|
ref_sample["max_rewards"],
|
||||||
"Q-Q Plot of Sample A Max Rewards",
|
"Q-Q Plot of Ref Sample Max Rewards",
|
||||||
f"{OUTPUT_DIR}/qqplot_sample_a_max_reward.png",
|
f"{output_dir}/qqplot_sample_a_max_reward.png",
|
||||||
)
|
)
|
||||||
plot_qqplot(
|
plot_qqplot(
|
||||||
sample_b_stats["max_rewards"],
|
new_sample["max_rewards"],
|
||||||
"Q-Q Plot of Sample B Max Rewards",
|
"Q-Q Plot of New Sample Max Rewards",
|
||||||
f"{OUTPUT_DIR}/qqplot_sample_b_max_reward.png",
|
f"{output_dir}/qqplot_sample_b_max_reward.png",
|
||||||
)
|
)
|
||||||
plot_qqplot(
|
plot_qqplot(
|
||||||
sample_a_stats["sum_rewards"],
|
ref_sample["sum_rewards"],
|
||||||
"Q-Q Plot of Sample A Sum Rewards",
|
"Q-Q Plot of Ref Sample Sum Rewards",
|
||||||
f"{OUTPUT_DIR}/qqplot_sample_a_sum_reward.png",
|
f"{output_dir}/qqplot_sample_a_sum_reward.png",
|
||||||
)
|
)
|
||||||
plot_qqplot(
|
plot_qqplot(
|
||||||
sample_b_stats["sum_rewards"],
|
new_sample["sum_rewards"],
|
||||||
"Q-Q Plot of Sample B Sum Rewards",
|
"Q-Q Plot of New Sample Sum Rewards",
|
||||||
f"{OUTPUT_DIR}/qqplot_sample_b_sum_reward.png",
|
f"{output_dir}/qqplot_sample_b_sum_reward.png",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sample_a_stats = get_eval_info_episodes(SAMPLE_A_PATH)
|
parser = argparse.ArgumentParser(
|
||||||
sample_b_stats = get_eval_info_episodes(SAMPLE_B_PATH)
|
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
||||||
|
)
|
||||||
|
parser.add_argument("ref_sample_path", type=Path, help="Path to the reference sample JSON file.")
|
||||||
|
parser.add_argument("new_sample_path", type=Path, help="Path to the new sample JSON file.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output_dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("outputs/compare/"),
|
||||||
|
help="Directory to save the output results. Defaults to outputs/compare/",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
init_logging(args.output_dir)
|
||||||
|
|
||||||
perform_tests(sample_a_stats, sample_b_stats)
|
ref_sample = get_eval_info_episodes(args.ref_sample_path)
|
||||||
|
new_sample = get_eval_info_episodes(args.new_sample_path)
|
||||||
|
perform_tests(ref_sample, new_sample, args.output_dir)
|
||||||
|
|
Loading…
Reference in New Issue