Simon Alibert 2024-05-21 16:31:48 +02:00
parent 62d3546f08
commit c6a61e3ba2
1 changed file with 144 additions and 106 deletions

@@ -34,6 +34,7 @@ import scipy.stats as stats
 from scipy.stats import anderson, kstest, mannwhitneyu, normaltest, shapiro, ttest_ind, ttest_rel, wilcoxon
 from statsmodels.stats.contingency_tables import mcnemar
 from termcolor import colored
+from terminaltables import AsciiTable
 
 
 def init_logging() -> None:
@@ -75,11 +76,19 @@ def get_eval_info_episodes(eval_info_path: Path) -> dict:
     }
 
 
-def describe_samples(ref_sample: dict, new_sample: dict, metric_name: str):
-    ref_mean, ref_std = np.mean(ref_sample[metric_name]), np.std(ref_sample[metric_name])
-    new_mean, new_std = np.mean(new_sample[metric_name]), np.std(new_sample[metric_name])
-    logging.info(f"{metric_name} - Ref sample: mean = {ref_mean:.3f}, std = {ref_std:.3f}")
-    logging.info(f"{metric_name} - New sample: mean = {new_mean:.3f}, std = {new_std:.3f}")
+def append_table_metric(table: list, metric: str, ref_sample: dict, new_sample: dict, mean_std: bool = False):
+    if mean_std:
+        ref_metric = f"{np.mean(ref_sample[metric]):.3f} ({np.std(ref_sample[metric]):.3f})"
+        new_metric = f"{np.mean(new_sample[metric]):.3f} ({np.std(new_sample[metric]):.3f})"
+        row_header = f"{metric} - mean (std)"
+    else:
+        ref_metric = ref_sample[metric]
+        new_metric = new_sample[metric]
+        row_header = metric
+    row = [row_header, ref_metric, new_metric]
+    table.append(row)
+    return table
 
 
 def cohens_d(x, y):
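For context: the new `append_table_metric` helper builds up rows for a `terminaltables` `AsciiTable`, mirroring the call sequence added to `perform_tests` further down in this diff. A minimal usage sketch (the sample dicts and their values are invented for illustration):

import numpy as np
from terminaltables import AsciiTable

# Hypothetical eval samples; the real ones come from get_eval_info_episodes().
ref = {"num_episodes": 4, "successes": np.array([1, 0, 1, 1])}
new = {"num_episodes": 4, "successes": np.array([1, 1, 1, 0])}

table_data = [["Metric", "Ref.", "New"]]
table_data = append_table_metric(table_data, "num_episodes", ref, new)
table_data = append_table_metric(table_data, "successes", ref, new, mean_std=True)
print(AsciiTable(table_data).table)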
@@ -103,114 +112,22 @@ def normality_tests(array: np.ndarray, name: str):
     return sw_p > 0.05 and ks_p > 0.05
 
 
-def plot_boxplot(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
-    plt.boxplot([data_a, data_b], labels=labels)
-    plt.title(title)
-    plt.savefig(filename)
-    plt.close()
-
-
-def plot_histogram(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
-    plt.hist(data_a, bins=30, alpha=0.7, label=labels[0])
-    plt.hist(data_b, bins=30, alpha=0.7, label=labels[1])
-    plt.title(title)
-    plt.legend()
-    plt.savefig(filename)
-    plt.close()
-
-
-def plot_qqplot(data: np.ndarray, title: str, filename: str):
-    stats.probplot(data, dist="norm", plot=plt)
-    plt.title(title)
-    plt.savefig(filename)
-    plt.close()
-
-
-def paired_sample_tests(ref_sample: dict, new_sample: dict):
-    log_section("Normality tests")
-    max_reward_diff = ref_sample["max_rewards"] - new_sample["max_rewards"]
-    sum_reward_diff = ref_sample["sum_rewards"] - new_sample["sum_rewards"]
-    normal_max_reward_diff = normality_tests(max_reward_diff, "Max Reward Difference")
-    normal_sum_reward_diff = normality_tests(sum_reward_diff, "Sum Reward Difference")
-
-    log_section("Paired-sample tests")
-    if normal_max_reward_diff:
-        t_stat_max_reward, p_val_max_reward = ttest_rel(ref_sample["max_rewards"], new_sample["max_rewards"])
-        log_test(f"Paired t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
-    else:
-        w_stat_max_reward, p_wilcox_max_reward = wilcoxon(
-            ref_sample["max_rewards"], new_sample["max_rewards"]
-        )
-        log_test(f"Wilcoxon test for Max Reward: statistic = {w_stat_max_reward:.3f}", p_wilcox_max_reward)
-
-    if normal_sum_reward_diff:
-        t_stat_sum_reward, p_val_sum_reward = ttest_rel(ref_sample["sum_rewards"], new_sample["sum_rewards"])
-        log_test(f"Paired t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
-    else:
-        w_stat_sum_reward, p_wilcox_sum_reward = wilcoxon(
-            ref_sample["sum_rewards"], new_sample["sum_rewards"]
-        )
-        log_test(f"Wilcoxon test for Sum Reward: statistic = {w_stat_sum_reward:.3f}", p_wilcox_sum_reward)
-
-    table = np.array(
-        [
-            [
-                np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 1)),
-                np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 0)),
-            ],
-            [
-                np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 1)),
-                np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 0)),
-            ],
-        ]
-    )
-    mcnemar_result = mcnemar(table, exact=True)
-    log_test(f"McNemar's test for Success: statistic = {mcnemar_result.statistic:.3f}", mcnemar_result.pvalue)
-
-
-def independent_sample_tests(ref_sample: dict, new_sample: dict):
-    log_section("Normality tests")
-    normal_max_rewards_a = normality_tests(ref_sample["max_rewards"], "Max Rewards Ref Sample")
-    normal_max_rewards_b = normality_tests(new_sample["max_rewards"], "Max Rewards New Sample")
-    normal_sum_rewards_a = normality_tests(ref_sample["sum_rewards"], "Sum Rewards Ref Sample")
-    normal_sum_rewards_b = normality_tests(new_sample["sum_rewards"], "Sum Rewards New Sample")
-
-    log_section("Independent samples tests")
-    if normal_max_rewards_a and normal_max_rewards_b:
-        t_stat_max_reward, p_val_max_reward = ttest_ind(
-            ref_sample["max_rewards"], new_sample["max_rewards"], equal_var=False
-        )
-        log_test(f"Two-Sample t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
-    else:
-        u_stat_max_reward, p_u_max_reward = mannwhitneyu(ref_sample["max_rewards"], new_sample["max_rewards"])
-        log_test(f"Mann-Whitney U test for Max Reward: U-statistic = {u_stat_max_reward:.3f}", p_u_max_reward)
-
-    if normal_sum_rewards_a and normal_sum_rewards_b:
-        t_stat_sum_reward, p_val_sum_reward = ttest_ind(
-            ref_sample["sum_rewards"], new_sample["sum_rewards"], equal_var=False
-        )
-        log_test(f"Two-Sample t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
-    else:
-        u_stat_sum_reward, p_u_sum_reward = mannwhitneyu(ref_sample["sum_rewards"], new_sample["sum_rewards"])
-        log_test(f"Mann-Whitney U test for Sum Reward: U-statistic = {u_stat_sum_reward:.3f}", p_u_sum_reward)
 
 
 def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path, independent: bool = False):
-    log_section("Descriptive Stats")
-    logging.info(f"Number of episode - Ref Sample: {ref_sample['num_episodes']}")
-    logging.info(f"Number of episode - New Sample: {new_sample['num_episodes']}")
     seeds_a, seeds_b = ref_sample["seeds"], new_sample["seeds"]
     if (seeds_a == seeds_b) and not independent:
-        logging.info("Samples are paired (identical seeds).")
+        logging.info("\nSamples are paired (identical seeds).")
         paired = True
     else:
-        logging.info("Samples are considered independent (seeds are different).")
+        logging.info("\nSamples are considered independent (seeds are different).")
         paired = False
 
-    describe_samples(ref_sample, new_sample, "successes")
-    describe_samples(ref_sample, new_sample, "max_rewards")
-    describe_samples(ref_sample, new_sample, "sum_rewards")
+    table_data = [["Metric", "Ref.", "New"]]
+    table_data = append_table_metric(table_data, "num_episodes", ref_sample, new_sample)
+    table_data = append_table_metric(table_data, "successes", ref_sample, new_sample, mean_std=True)
+    table_data = append_table_metric(table_data, "max_rewards", ref_sample, new_sample, mean_std=True)
+    table_data = append_table_metric(table_data, "sum_rewards", ref_sample, new_sample, mean_std=True)
+    table = AsciiTable(table_data)
+    print(table.table)
log_section("Effect Size") log_section("Effect Size")
d_max_reward = cohens_d(ref_sample["max_rewards"], new_sample["max_rewards"]) d_max_reward = cohens_d(ref_sample["max_rewards"], new_sample["max_rewards"])
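The body of `cohens_d` falls outside this hunk's context lines. For reference, a standard pooled-standard-deviation form of Cohen's d looks like the sketch below; this is an assumption about the helper, not code from the file:

import numpy as np

def cohens_d_sketch(x: np.ndarray, y: np.ndarray) -> float:
    # Effect size: mean difference scaled by the pooled standard deviation.
    # NOTE: assumed implementation; the file's actual cohens_d is not shown in this diff.
    nx, ny = len(x), len(y)
    pooled_var = ((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)) / (nx + ny - 2)
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)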
@@ -277,6 +194,127 @@ def perform_tests(ref_sample: dict, new_sample: dict, output_dir: Path, independent: bool = False):
     )
+
+
+def paired_sample_tests(ref_sample: dict, new_sample: dict):
+    log_section("Normality tests")
+    max_reward_diff = ref_sample["max_rewards"] - new_sample["max_rewards"]
+    sum_reward_diff = ref_sample["sum_rewards"] - new_sample["sum_rewards"]
+    normal_max_reward_diff = normality_tests(max_reward_diff, "Max Reward Difference")
+    normal_sum_reward_diff = normality_tests(sum_reward_diff, "Sum Reward Difference")
+
+    log_section("Paired-sample tests")
+    if normal_max_reward_diff:
+        t_stat_max_reward, p_val_max_reward = ttest_rel(ref_sample["max_rewards"], new_sample["max_rewards"])
+        log_test(f"Paired t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
+    else:
+        w_stat_max_reward, p_wilcox_max_reward = wilcoxon(
+            ref_sample["max_rewards"], new_sample["max_rewards"]
+        )
+        log_test(f"Wilcoxon test for Max Reward: statistic = {w_stat_max_reward:.3f}", p_wilcox_max_reward)
+
+    if normal_sum_reward_diff:
+        t_stat_sum_reward, p_val_sum_reward = ttest_rel(ref_sample["sum_rewards"], new_sample["sum_rewards"])
+        log_test(f"Paired t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
+    else:
+        w_stat_sum_reward, p_wilcox_sum_reward = wilcoxon(
+            ref_sample["sum_rewards"], new_sample["sum_rewards"]
+        )
+        log_test(f"Wilcoxon test for Sum Reward: statistic = {w_stat_sum_reward:.3f}", p_wilcox_sum_reward)
+
+    table = np.array(
+        [
+            [
+                np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 1)),
+                np.sum((ref_sample["successes"] == 1) & (new_sample["successes"] == 0)),
+            ],
+            [
+                np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 1)),
+                np.sum((ref_sample["successes"] == 0) & (new_sample["successes"] == 0)),
+            ],
+        ]
+    )
+    mcnemar_result = mcnemar(table, exact=True)
+    log_test(f"McNemar's test for Success: statistic = {mcnemar_result.statistic:.3f}", mcnemar_result.pvalue)
+
+
+def independent_sample_tests(ref_sample: dict, new_sample: dict):
+    log_section("Normality tests")
+    normal_max_rewards_a = normality_tests(ref_sample["max_rewards"], "Max Rewards Ref Sample")
+    normal_max_rewards_b = normality_tests(new_sample["max_rewards"], "Max Rewards New Sample")
+    normal_sum_rewards_a = normality_tests(ref_sample["sum_rewards"], "Sum Rewards Ref Sample")
+    normal_sum_rewards_b = normality_tests(new_sample["sum_rewards"], "Sum Rewards New Sample")
+
+    log_section("Independent samples tests")
+    table = [["Test", "max_rewards", "sum_rewards"]]
+    if normal_max_rewards_a and normal_max_rewards_b:
+        table = append_independent_test(
+            table, ref_sample, new_sample, ttest_ind, "Two-Sample t-test", kwargs={"equal_var": False}
+        )
+        t_stat_max_reward, p_val_max_reward = ttest_ind(
+            ref_sample["max_rewards"], new_sample["max_rewards"], equal_var=False
+        )
+        log_test(f"Two-Sample t-test for Max Reward: t-statistic = {t_stat_max_reward:.3f}", p_val_max_reward)
+    else:
+        table = append_independent_test(table, ref_sample, new_sample, mannwhitneyu, "Mann-Whitney U")
+        u_stat_max_reward, p_u_max_reward = mannwhitneyu(ref_sample["max_rewards"], new_sample["max_rewards"])
+        log_test(f"Mann-Whitney U test for Max Reward: U-statistic = {u_stat_max_reward:.3f}", p_u_max_reward)
+
+    if normal_sum_rewards_a and normal_sum_rewards_b:
+        t_stat_sum_reward, p_val_sum_reward = ttest_ind(
+            ref_sample["sum_rewards"], new_sample["sum_rewards"], equal_var=False
+        )
+        log_test(f"Two-Sample t-test for Sum Reward: t-statistic = {t_stat_sum_reward:.3f}", p_val_sum_reward)
+    else:
+        u_stat_sum_reward, p_u_sum_reward = mannwhitneyu(ref_sample["sum_rewards"], new_sample["sum_rewards"])
+        log_test(f"Mann-Whitney U test for Sum Reward: U-statistic = {u_stat_sum_reward:.3f}", p_u_sum_reward)
+
+    table = AsciiTable(table)
+    print(table.table)
+
+
+def append_independent_test(
+    table: list,
+    ref_sample: dict,
+    new_sample: dict,
+    test: callable,
+    test_name: str,
+    kwargs: dict | None = None,
+) -> list:
+    kwargs = {} if kwargs is None else kwargs
+    row = [f"{test_name}: p-value ≥ alpha"]
+    for metric in table[0][1:]:
+        _, p_val = test(ref_sample[metric], new_sample[metric], **kwargs)
+        alpha = 0.05
+        status = "✅" if p_val >= alpha else "❌"
+        row.append(f"{status} {p_val:.3f} ≥ {alpha}")
+    table.append(row)
+    return table
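`append_independent_test` reads the metric names from the table's header row, so every header cell after "Test" must match a key in the sample dicts. A quick sketch with synthetic data:

import numpy as np
from scipy.stats import mannwhitneyu
from terminaltables import AsciiTable

rng = np.random.default_rng(0)
ref = {"max_rewards": rng.normal(1.0, 0.2, 50), "sum_rewards": rng.normal(10.0, 2.0, 50)}
new = {"max_rewards": rng.normal(1.1, 0.2, 50), "sum_rewards": rng.normal(11.0, 2.0, 50)}

table = [["Test", "max_rewards", "sum_rewards"]]  # header names double as dict keys
table = append_independent_test(table, ref, new, mannwhitneyu, "Mann-Whitney U")
print(AsciiTable(table).table)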
+
+
+def plot_boxplot(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
+    plt.boxplot([data_a, data_b], labels=labels)
+    plt.title(title)
+    plt.savefig(filename)
+    plt.close()
+
+
+def plot_histogram(data_a: np.ndarray, data_b: np.ndarray, labels: list[str], title: str, filename: str):
+    plt.hist(data_a, bins=30, alpha=0.7, label=labels[0])
+    plt.hist(data_b, bins=30, alpha=0.7, label=labels[1])
+    plt.title(title)
+    plt.legend()
+    plt.savefig(filename)
+    plt.close()
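The plotting helpers take an output filename directly; presumably `perform_tests` derives these paths from `output_dir`. A usage sketch with synthetic arrays and placeholder filenames:

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.3, 1.0, 100)
# Filenames below are placeholders, not paths the script actually uses.
plot_histogram(a, b, labels=["Ref", "New"], title="Max rewards", filename="hist_max_rewards.png")
plot_qqplot(a - b, title="Paired differences QQ-plot", filename="qq_max_rewards.png")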
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter