From bc3b20f81f91d2fd447d06c9cae8f523983b9767 Mon Sep 17 00:00:00 2001
From: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Date: Sun, 3 Aug 2025 20:06:15 -0700
Subject: [PATCH] accepted length code

---
 benchmarks/run.sh                        |  95 +++++++++++++--
 benchmarks/visualize/common.py           |  63 ++++++++++
 benchmarks/visualize/vis_acc.py          | 143 +++++++++++++++--------
 benchmarks/visualize/vis_acc_diff.py     |   4 +-
 benchmarks/visualize/vis_prob_entropy.py |  38 ++++++
 5 files changed, 286 insertions(+), 57 deletions(-)
 create mode 100644 benchmarks/visualize/common.py
 create mode 100644 benchmarks/visualize/vis_prob_entropy.py

diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 8b35a237807a7..1deed20d1d26e 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -66,6 +66,15 @@
 #     --speculative_config '{"method": "eagle3", "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", "num_speculative_tokens": 20}'
 
 
+# EAGLE-3 speculative decoding on MT-Bench with Meta-Llama-3.1-8B-Instruct (20 speculative tokens).
+python benchmarks/benchmark_throughput.py \
+    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --prefix-len 0 \
+    --output-len 512 \
+    --num-prompts 200 \
+    --speculative_config '{"method": "eagle3", "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", "num_speculative_tokens": 20}'
 
 # python benchmarks/benchmark_throughput.py \
 #     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
@@ -119,18 +128,86 @@
 #     --dataset-name hf \
 #     --dataset-path AI-MO/aimo-validation-aime \
 #     --prefix-len 0 \
-#     --output-len 5120 \
+#     --output-len 1024 \
 #     --num-prompts 90 \
 #     --speculative_config '{"method": "eagle3", "num_speculative_tokens": 20, "model": "yuhuili/EAGLE3-DeepSeek-R1-Distill-LLaMA-8B"}'
 
 
 
-python benchmarks/benchmark_throughput.py \
-    --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --prefix-len 0 \
-    --output-len 5120 \
-    --num-prompts 90 \
-    --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
+# python benchmarks/benchmark_throughput.py \
+#     --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+#     --dataset-name hf \
+#     --dataset-path AI-MO/aimo-validation-aime \
+#     --prefix-len 0 \
+#     --output-len 1024 \
+#     --num-prompts 90 \
+#     --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
 
+
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name sharegpt \
+#     --dataset-path /data/lily/ShareGPT_V3_unfiltered_cleaned_split.json  \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
+
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path philschmid/mt-bench \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path philschmid/mt-bench \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "eagle", "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "num_speculative_tokens": 20}'
+
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path abisee/cnn_dailymail \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "eagle", "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "num_speculative_tokens": 20}'
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path abisee/cnn_dailymail \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
+
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path philschmid/mt-bench \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 10 \
+#     --speculative_config '{"method": "eagle3", "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", "num_speculative_tokens": 20}'
+
+
+# python benchmarks/benchmark_throughput.py \
+#     --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+#     --dataset-name hf \
+#     --dataset-path abisee/cnn_dailymail \
+#     --prefix-len 0 \
+#     --output-len 512 \
+#     --num-prompts 200 \
+#     --speculative_config '{"method": "ngram", "num_speculative_tokens": 20, "prompt_lookup_min": 2, "prompt_lookup_max": 5}'
diff --git a/benchmarks/visualize/common.py b/benchmarks/visualize/common.py
new file mode 100644
index 0000000000000..42b3da815b0a2
--- /dev/null
+++ b/benchmarks/visualize/common.py
@@ -0,0 +1,63 @@
+import json
+from dataclasses import dataclass
+
+MODEL_TO_NAMES = {
+    "r1-distill-llama-8B": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "llama3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "llama3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
+    "llama3.1-70B": "meta-llama/Llama-3.1-70B-Instruct",
+}
+
+@dataclass
+class AccStats:
+    """Per-request acceptance statistics (accepted lengths plus optional extras).
+
+    ``lens`` holds the accepted lengths; ``probs`` and ``entropies`` are
+    optional and, when given, must have as many entries as ``lens``.  On
+    construction the first ``lens`` entry (prefill) and the last entry of
+    ``probs``/``entropies`` (last proposed tokens) are dropped, which keeps
+    the lists equal in length.
+
+    Raises:
+        ValueError: if ``probs`` or ``entropies`` differs in length from ``lens``.
+    """
+
+    lens: list[int]
+    probs: "list[list[float]] | None" = None  # each entry is sliced per token by callers
+    entropies: "list | None" = None  # NOTE(review): entry shape not visible here
+
+    def __post_init__(self):
+        for name in ("probs", "entropies"):
+            values = getattr(self, name)
+            if values is None:
+                continue
+            if len(values) != len(self.lens):
+                raise ValueError(f"Length of lens and {name} must match")
+            setattr(self, name, values[:-1])  # remove the last proposed tokens
+        self.lens = self.lens[1:]  # remove the prefill accepted lens
+
+    @property
+    def length(self):
+        return len(self.lens)
+
+def load_data(datapath, tokenizer, verbose=False):
+    """Load one ``AccStats`` per non-blank line of a JSON-lines file.
+
+    Records need ``acc.acc_len``; ``acc.acc_prob``/``acc.acc_entropy`` are optional.
+    ``tokenizer`` is only used when ``verbose`` is set; an empty file returns ``[]``.
+    """
+    acceptance_stats = []
+    with open(datapath, "r", encoding="utf-8") as f:
+        for line in filter(str.strip, f):  # skip blank lines json.loads would reject
+            data = json.loads(line)
+            acc = data["acc"]
+            stat = AccStats(lens=acc["acc_len"], probs=acc.get("acc_prob"),
+                            entropies=acc.get("acc_entropy"))
+            acceptance_stats.append(stat)
+            if verbose:
+                print("Input:", tokenizer.decode(data['prompt_token_ids']))
+                print("Output:", tokenizer.decode(data['generated_token_ids']))
+                print("=============================================")
+    max_length = max((stat.length for stat in acceptance_stats), default=0)
+    print(f"Load {len(acceptance_stats)} with max length {max_length}")
+    return acceptance_stats
diff --git a/benchmarks/visualize/vis_acc.py b/benchmarks/visualize/vis_acc.py
index d28c13009f961..26146cbb97a91 100644
--- a/benchmarks/visualize/vis_acc.py
+++ b/benchmarks/visualize/vis_acc.py
@@ -2,56 +2,107 @@ import json
 import seaborn as sns
 import matplotlib.pyplot as plt
 from transformers import AutoTokenizer
+from .common import MODEL_TO_NAMES, load_data
+import requests
+import os
+from pathlib import Path
 
+class AcceptanceStatsClient:
+    """Load acceptance statistics for one model/method/dataset and summarise or plot them."""
+
+    def __init__(self, model_name, method, dataset, data_path=None):
+        """Record the run identifiers, resolve the data file and load the tokenizer.
+
+        Default ``data_path`` follows ``/data/lily/batch-sd/data``; unknown ``model_name`` raises ``KeyError``.
+        """
+        self.model_name = model_name
+        self.method = method
+        self.dataset = dataset
+        if data_path is None:
+            data_path = f"/data/lily/batch-sd/data/{model_name}/{method}_{dataset}_acceptance_stats.jsonl"
+        self.data_path = data_path
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_NAMES[model_name], use_fast=False)
+        self.acceptance_stats = None
+
+    def load_data(self):
+        """Load the acceptance statistics from ``self.data_path`` and cache them."""
+        self.acceptance_stats = load_data(self.data_path, self.tokenizer)
+        return self.acceptance_stats
+
+    def _rows(self):
+        """Return one list of accepted lengths per request, loading data on first use."""
+        if self.acceptance_stats is None:
+            self.load_data()
+        # Entries may be AccStats objects (use .lens) or already plain sequences.
+        return [list(getattr(stat, "lens", stat)) for stat in self.acceptance_stats]
+
+    def plot_heatmap(self, output_dir="figures"):
+        """Save a request-by-position heatmap PDF under ``output_dir`` and return the figure."""
+        rows = self._rows()
+        width = max((len(row) for row in rows), default=0)
+        # Requests differ in length; pad with -2 so the matrix is rectangular.
+        matrix = [row + [-2] * (width - len(row)) for row in rows]
+        fig, ax = plt.subplots(figsize=(12, 8))
+        sns.heatmap(matrix, cmap="YlGnBu", ax=ax)
+        ax.set_xlabel("Position")
+        ax.set_ylabel("Request ID")
+        # A tick-less twin axis only carries the right-hand label.
+        ax2 = ax.twinx()
+        ax2.set_ylim(ax.get_ylim())
+        ax2.set_yticks([])
+        ax2.set_ylabel("# of Accepted Tokens", labelpad=10)
+        ax.set_title(f"Acceptance Statistics: {self.model_name} - {self.method} - {self.dataset}")
+        fig.tight_layout()
+        output_path = Path(output_dir) / self.model_name
+        os.makedirs(output_path, exist_ok=True)
+        output_file = output_path / f"{self.method}_{self.dataset}_acceptance_stats.pdf"
+        fig.savefig(output_file)
+        print(f"Saved heatmap to {output_file}")
+        return fig
+
+    def get_summary_stats(self):
+        """Return request count and mean accepted length per position and per request."""
+        rows = self._rows()
+        width = max((len(row) for row in rows), default=0)
+        # Average each position only over the requests that reached it.
+        columns = ([row[pos] for row in rows if pos < len(row)] for pos in range(width))
+        avg_by_position = [sum(col) / len(col) for col in columns]
+        avg_by_request = [sum(row) / len(row) for row in rows if row]  # skip empty requests
+        return {
+            "total_requests": len(rows),
+            "max_position": len(avg_by_position),
+            "avg_acceptance_rate": sum(avg_by_request) / len(avg_by_request) if avg_by_request else 0.0,
+            "avg_by_position": avg_by_position,
+            "avg_by_request": avg_by_request,
+        }
 
-model = "r1-distill-llama-8B"
-MODEL_TO_NAMES = {
-    "r1-distill-llama-8B" : "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-}
-method = "ngram"
-dataset = "aime"
-datapath = f"/data/lily/batch-sd/data/{model}/{method}_{dataset}_acceptance_stats.jsonl"
+# Example model configuration
+model = "llama3.1-8B"
+# model = "r1-distill-llama-8B"
+method = "eagle3"
+dataset = "mtbench"
+# dataset = "aime"
+# method = "ngram"
+# dataset = "cnndailymail"
+# datapath = f"/data/lily/batch-sd/data/{model}/{method}_{dataset}_acceptance_stats.jsonl"
+datapath = "acceptance_stats.jsonl"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_NAMES[model], use_fast=False)
 
-def cleanup(data):
-    # Remove the prefill phase
-    data = data[1:]
-    # Cap the maximum value to 10
-    data = [min(x, 10) for x in data]
-    return data
 
-def load_data(datapath):
-    acceptance_stats = []
-    with open(datapath, "r") as f:
-        lines = f.readlines()
-        for line in lines:  
-            data = json.loads(line)
-            acceptance_stats.append(cleanup(data['acc']))
-            print("Input:", tokenizer.decode(data['prompt_token_ids']))
-            print("Output:", tokenizer.decode(data['generated_token_ids']))
-            print("=============================================")
-            
-    # Pad the acceptance stats to the same length
-    max_length = max(len(stats) for stats in acceptance_stats)
-    for i in range(len(acceptance_stats)):
-        acceptance_stats[i] += [-2] * (max_length - len(acceptance_stats[i]))
-        
-    print(f"Load {len(acceptance_stats)} with max length {max_length}")
-    return acceptance_stats
+if __name__ == "__main__":
+    # Use the client instead of directly loading data
+    client = AcceptanceStatsClient(model, method, dataset, datapath)
+    acceptance_stats = client.load_data()
+    
+    # Get summary statistics
+    summary = client.get_summary_stats()
+    print("Summary Statistics:")
+    print(f"Total Requests: {summary['total_requests']}")
+    print(f"Max Position: {summary['max_position']}")
+    print(f"Average Acceptance Rate: {summary['avg_acceptance_rate']:.2f}")
 
-acceptance_stats = load_data(datapath)
+    # Create heatmap visualization
+    plot_heatmap = False
+    if plot_heatmap:
+        client.plot_heatmap()
 
-
-fig, ax = plt.subplots()
-sns.heatmap(acceptance_stats, cmap="YlGnBu")
-plt.xlabel("Position")
-plt.ylabel("Request ID")
-# Add Y-axis labels on the right
-ax2 = ax.twinx()
-ax2.set_ylim(ax.get_ylim())              # Match y-axis range
-ax2.set_yticks([])                       # Remove right tick marks if undesired
-ax2.set_ylabel("# of Accepted Tokens", labelpad=10)         # Set right y-axis label
-
-
-plt.tight_layout()
-plt.savefig(f"figures/{model}/{method}_{dataset}_acceptance_stats.png")
diff --git a/benchmarks/visualize/vis_acc_diff.py b/benchmarks/visualize/vis_acc_diff.py
index 1b45d4ccd44e6..a9044ea226a10 100644
--- a/benchmarks/visualize/vis_acc_diff.py
+++ b/benchmarks/visualize/vis_acc_diff.py
@@ -5,7 +5,7 @@ from matplotlib.colors import LinearSegmentedColormap
 
 model = "llama3.1-8B"
 dataset = "instructcode"
-method1 = "eagle"
+method1 = "ngram"
 method2 = "eagle3"
 
 def get_datapath(method):
@@ -66,4 +66,4 @@ ax2.set_ylabel("# of Accepted Tokens", labelpad=10)         # Set right y-axis l
 plt.title(f"Diff between {method2} - {method1} acceptance stats for {dataset}")
 
 plt.tight_layout()
-plt.savefig(f"figures/{model}/diff_{method2}_{method1}_{dataset}_acceptance_stats.png")
+plt.savefig(f"figures/{model}/diff_{method2}_{method1}_{dataset}_acceptance_stats.pdf")
diff --git a/benchmarks/visualize/vis_prob_entropy.py b/benchmarks/visualize/vis_prob_entropy.py
new file mode 100644
index 0000000000000..70411fce3151b
--- /dev/null
+++ b/benchmarks/visualize/vis_prob_entropy.py
@@ -0,0 +1,38 @@
+from transformers import AutoTokenizer
+from common import MODEL_TO_NAMES, load_data
+import matplotlib.pyplot as plt
+
+
+def plot_prob_entropy(acceptance_stats, output_path):
+    """Save overlaid histograms of accepted vs. rejected probabilities.
+
+    For each step the first ``acc_len - 1`` probabilities count as accepted and
+    the rest as rejected.  Requests without recorded probabilities are skipped.
+    """
+    acc_probs, rej_probs = [], []
+    for stat in acceptance_stats:
+        for acc_len, step_probs in zip(stat.lens, stat.probs or []):
+            split = max(acc_len - 1, 0)  # a negative index would slice from the end
+            acc_probs.extend(step_probs[:split])
+            rej_probs.extend(step_probs[split:])
+    fig, ax = plt.subplots(figsize=(12, 8))
+    ax.hist(acc_probs, bins=100, alpha=0.5, label='Accepted Probabilities', color='green')
+    ax.hist(rej_probs, bins=100, alpha=0.5, label='Rejected Probabilities', color='red')
+    ax.set(xlabel='Probability', ylabel='Frequency',
+           title='Distribution of Accepted and Rejected Probabilities')
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(output_path)
+    plt.close(fig)  # nothing is returned, so release the figure here
+
+
+if __name__ == "__main__":
+    # Hard-coded inputs for a one-off plot; edit these to point at another run.
+    datapath = "/data/lily/sd-benchmark-paper/batch-sd/acceptance_stats.jsonl"
+    model = "llama3.1-8B"
+    # load_data only uses the tokenizer for verbose printing, but still requires one.
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_NAMES[model],
+                                              use_fast=False)
+    acceptance_stats = load_data(datapath, tokenizer)
+    # NOTE(review): no file extension here, so the output format is matplotlib's default; confirm.
+    plot_prob_entropy(acceptance_stats, output_path="prob_entropy_figures")