from transformers import AutoTokenizer from common import MODEL_TO_NAMES, load_data import matplotlib.pyplot as plt def plot_prob_entropy(acceptance_stats, output_path): acc_probs = [] rej_probs = [] for stat in acceptance_stats: for i, acc_len in enumerate(stat.lens): acc_probs.extend(stat.probs[i][:acc_len-1]) rej_probs.extend(stat.probs[i][acc_len-1:]) fig, ax = plt.subplots(figsize=(12, 8)) plt.hist(acc_probs, bins=100, alpha=0.5, label='Accepted Probabilities', color='green') plt.hist(rej_probs, bins=100, alpha=0.5, label='Rejected Probabilities', color='red') plt.xlabel('Probability') plt.ylabel('Frequency') plt.title('Distribution of Accepted and Rejected Probabilities') plt.legend() plt.tight_layout() plt.savefig(output_path) if __name__ == "__main__": datapath = "/data/lily/sd-benchmark-paper/batch-sd/acceptance_stats.jsonl" model = "llama3.1-8B" tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_NAMES[model], use_fast=False) acceptance_stats = load_data(datapath, tokenizer) plot_prob_entropy(acceptance_stats, output_path="prob_entropy_figures")