mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 12:17:05 +08:00
more changes Apply suggestions from code review Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> fix tp Signed-off-by: Lu Fang <fanglu@fb.com> add comparison tool tmp add unit test and fix format Signed-off-by: Lu Fang <fanglu@fb.com> add comparison script and documentation Signed-off-by: Lu Fang <fanglu@fb.com> provide default intermediate logging Signed-off-by: Lu Fang <fanglu@fb.com> optional register il Signed-off-by: Lu Fang <fanglu@fb.com> add input reload and improve intermediate compare
707 lines
27 KiB
Python
Executable File
707 lines
27 KiB
Python
Executable File
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
Script to compare intermediate logging outputs from two different runs.
|
|
|
|
This script compares the tensor outputs from two different intermediate logging
|
|
directories and generates a report of the differences.
|
|
|
|
Usage:
|
|
python compare_intermediate.py --dir1 /path/to/first/log/dir --dir2 /path/to/second/log/dir [options]
|
|
|
|
Options:
|
|
--dir1 DIR First intermediate logging directory
|
|
--dir2 DIR Second intermediate logging directory
|
|
--output FILE Output file for the report (default: stdout)
|
|
--format {md,json} Output format (default: md)
|
|
--rtol FLOAT Relative tolerance for tensor comparison (default: 1e-5)
|
|
--atol FLOAT Absolute tolerance for tensor comparison (default: 1e-8)
|
|
--steps STEPS Comma-separated list of steps to compare (default: all)
|
|
--modules MODULES Comma-separated list of module name patterns to compare (default: all)
|
|
--verbose Include detailed information about each tensor
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import torch
|
|
|
|
|
|
def load_tensor(path: Path) -> torch.Tensor:
|
|
"""Load a tensor from a .pt file."""
|
|
try:
|
|
return torch.load(path, map_location="cpu")
|
|
except Exception as e:
|
|
print(f"Error loading tensor from {path}: {e}")
|
|
return None
|
|
|
|
|
|
def load_json(path: Path) -> Dict:
|
|
"""Load a JSON file."""
|
|
try:
|
|
with open(path, "r") as f:
|
|
return json.load(f)
|
|
except Exception as e:
|
|
print(f"Error loading JSON from {path}: {e}")
|
|
return {}
|
|
|
|
|
|
def extract_diff_metatada(exception_str: str) -> Dict:
|
|
try:
|
|
num_diff_elements = int(
|
|
re.search(r"Mismatched elements: (\d+) /", exception_str).group(1)
|
|
)
|
|
total_elements = int(
|
|
re.search(r"Mismatched elements: \d+ / (\d+)", exception_str).group(1)
|
|
)
|
|
max_abs_diff = float(
|
|
re.search(
|
|
r"Greatest absolute difference: ([\d\.e-]+)", exception_str
|
|
).group(1)
|
|
)
|
|
max_rel_diff = float(
|
|
re.search(
|
|
r"Greatest relative difference: ([\d\.e-]+)", exception_str
|
|
).group(1)
|
|
)
|
|
return {
|
|
"num_diff_elements": num_diff_elements,
|
|
"total_elements": total_elements,
|
|
"max_abs_diff": max_abs_diff,
|
|
"max_rel_diff": max_rel_diff,
|
|
}
|
|
except Exception:
|
|
return {"error": exception_str}
|
|
|
|
|
|
def compare_tensors(
|
|
tensor1: torch.Tensor, tensor2: torch.Tensor, rtol: float, atol: float
|
|
) -> Dict:
|
|
"""Compare two tensors and return a dictionary with comparison results."""
|
|
if tensor1 is None or tensor2 is None:
|
|
return {"match": False, "error": "One or both tensors are None"}
|
|
|
|
if tensor1.shape != tensor2.shape:
|
|
return {
|
|
"match": False,
|
|
"error": f"Shape mismatch: {tensor1.shape} vs {tensor2.shape}",
|
|
}
|
|
|
|
if tensor1.dtype != tensor2.dtype:
|
|
return {
|
|
"match": False,
|
|
"error": f"Dtype mismatch: {tensor1.dtype} vs {tensor2.dtype}",
|
|
}
|
|
|
|
# Check if tensors are close using PyTorch's assert_close
|
|
try:
|
|
torch.testing.assert_close(tensor1, tensor2, rtol=rtol, atol=atol)
|
|
except Exception as e:
|
|
return {"match": False, **extract_diff_metatada(str(e))}
|
|
return {"match": True}
|
|
|
|
|
|
def compare_json_values(value1: Any, value2: Any) -> Dict:
|
|
"""Compare two JSON values and return a dictionary with comparison results."""
|
|
if type(value1) is not type(value2):
|
|
return {
|
|
"match": False,
|
|
"error": f"Type mismatch: {type(value1).__name__} vs {type(value2).__name__}",
|
|
}
|
|
|
|
if isinstance(value1, dict):
|
|
# Compare dictionaries
|
|
all_keys = set(value1.keys()) | set(value2.keys())
|
|
mismatches = {}
|
|
|
|
for key in all_keys:
|
|
if key not in value1:
|
|
mismatches[key] = {"error": "Missing in first dict"}
|
|
elif key not in value2:
|
|
mismatches[key] = {"error": "Missing in second dict"}
|
|
else:
|
|
comparison = compare_json_values(value1[key], value2[key])
|
|
if not comparison["match"]:
|
|
mismatches[key] = comparison
|
|
|
|
if mismatches:
|
|
return {"match": False, "mismatches": mismatches}
|
|
return {"match": True}
|
|
|
|
elif isinstance(value1, list):
|
|
# Compare lists
|
|
if len(value1) != len(value2):
|
|
return {
|
|
"match": False,
|
|
"error": f"Length mismatch: {len(value1)} vs {len(value2)}",
|
|
}
|
|
|
|
mismatches = {}
|
|
for i, (item1, item2) in enumerate(zip(value1, value2)):
|
|
comparison = compare_json_values(item1, item2)
|
|
if not comparison["match"]:
|
|
mismatches[i] = comparison
|
|
|
|
if mismatches:
|
|
return {"match": False, "mismatches": mismatches}
|
|
return {"match": True}
|
|
|
|
else:
|
|
# Compare primitive values
|
|
if value1 == value2:
|
|
return {"match": True}
|
|
else:
|
|
return {"match": False, "value1": value1, "value2": value2}
|
|
|
|
|
|
def find_tensor_files(directory: Path) -> Dict[str, Dict[str, Dict[str, List[Path]]]]:
|
|
"""
|
|
Find all tensor files in the given directory.
|
|
|
|
Returns a dictionary with the structure:
|
|
{
|
|
"step_0": {
|
|
"module_name_123456": {
|
|
"inputs": [Path("inputs_0_cuda_0.pt"), ...],
|
|
"outputs": [Path("output_cuda_0.pt"), ...]
|
|
},
|
|
...
|
|
},
|
|
...
|
|
}
|
|
"""
|
|
result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
|
|
|
|
# Find all step directories
|
|
step_dirs = [d for d in directory.glob("step_*") if d.is_dir()]
|
|
|
|
for step_dir in step_dirs:
|
|
step_name = step_dir.name
|
|
|
|
# Find all module directories
|
|
module_dirs = [d for d in step_dir.glob("*") if d.is_dir()]
|
|
|
|
for module_dir in module_dirs:
|
|
module_name = module_dir.name
|
|
|
|
# Find input tensor files
|
|
input_tensors = list(module_dir.glob("inputs_*.pt"))
|
|
if input_tensors:
|
|
result[step_name][module_name]["inputs"] = input_tensors
|
|
|
|
# Find output tensor files
|
|
output_tensors = list(module_dir.glob("output*.pt"))
|
|
if output_tensors:
|
|
result[step_name][module_name]["outputs"] = output_tensors
|
|
|
|
# Find JSON metadata files
|
|
inputs_json = module_dir / "inputs.json"
|
|
if inputs_json.exists():
|
|
result[step_name][module_name]["inputs_json"] = [inputs_json]
|
|
|
|
outputs_json = module_dir / "outputs.json"
|
|
if outputs_json.exists():
|
|
result[step_name][module_name]["outputs_json"] = [outputs_json]
|
|
|
|
return result
|
|
|
|
|
|
def filter_steps_and_modules(
|
|
tensor_files: Dict[str, Dict[str, Dict[str, List[Path]]]],
|
|
steps: Optional[List[str]] = None,
|
|
module_patterns: Optional[List[str]] = None,
|
|
) -> Dict[str, Dict[str, Dict[str, List[Path]]]]:
|
|
"""Filter tensor files by steps and module patterns."""
|
|
result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
|
|
|
|
# Filter steps
|
|
if steps:
|
|
step_names = [f"step_{step}" for step in steps]
|
|
steps_to_include = {step: True for step in step_names}
|
|
else:
|
|
steps_to_include = {step: True for step in tensor_files.keys()}
|
|
|
|
# Compile module patterns
|
|
if module_patterns:
|
|
compiled_patterns = [re.compile(pattern) for pattern in module_patterns]
|
|
else:
|
|
compiled_patterns = None
|
|
|
|
for step_name, modules in tensor_files.items():
|
|
if step_name not in steps_to_include:
|
|
continue
|
|
|
|
for module_name, file_types in modules.items():
|
|
# Check if module matches any pattern
|
|
if compiled_patterns:
|
|
if not any(
|
|
pattern.search(module_name) for pattern in compiled_patterns
|
|
):
|
|
continue
|
|
|
|
result[step_name][module_name] = file_types
|
|
|
|
return result
|
|
|
|
|
|
def compare_directories(
|
|
dir1: Path,
|
|
dir2: Path,
|
|
rtol: Optional[float] = None,
|
|
atol: Optional[float] = None,
|
|
steps: Optional[List[str]] = None,
|
|
module_patterns: Optional[List[str]] = None,
|
|
) -> Dict:
|
|
"""Compare two intermediate logging directories and return a report."""
|
|
# Find tensor files in both directories
|
|
tensor_files1 = find_tensor_files(dir1)
|
|
tensor_files2 = find_tensor_files(dir2)
|
|
|
|
# Filter by steps and modules
|
|
if steps or module_patterns:
|
|
tensor_files1 = filter_steps_and_modules(tensor_files1, steps, module_patterns)
|
|
tensor_files2 = filter_steps_and_modules(tensor_files2, steps, module_patterns)
|
|
|
|
# Get all steps and modules from both directories
|
|
all_steps = set(tensor_files1.keys()) | set(tensor_files2.keys())
|
|
|
|
report = {
|
|
"dir1": str(dir1),
|
|
"dir2": str(dir2),
|
|
"rtol": rtol,
|
|
"atol": atol,
|
|
"steps": {},
|
|
}
|
|
|
|
# Compare each step
|
|
for step in sorted(all_steps):
|
|
step_report = {
|
|
"modules": {},
|
|
"summary": {
|
|
"total_modules": 0,
|
|
"matching_modules": 0,
|
|
"mismatched_modules": 0,
|
|
"missing_modules": 0,
|
|
},
|
|
}
|
|
|
|
# Get all modules from both directories for this step
|
|
modules1 = tensor_files1.get(step, {})
|
|
modules2 = tensor_files2.get(step, {})
|
|
# TODO: read from module calls.txt to get the full module list
|
|
# TODO: check if module calls txt exsits
|
|
dir1_module_call_file = dir1 / step / "module_calls.txt"
|
|
if dir1_module_call_file.exists():
|
|
with open(dir1 / step / "module_calls.txt", "r") as f:
|
|
all_modules = f.read().splitlines()
|
|
else:
|
|
print(
|
|
"Warnings: the module call orders are missed, ordering using module alphbetics"
|
|
)
|
|
all_modules = sorted(set(modules1.keys()) | set(modules2.keys()))
|
|
step_report["module_call_list"] = []
|
|
for module in all_modules:
|
|
module_report = {
|
|
"inputs": {},
|
|
"outputs": {},
|
|
"summary": {
|
|
"total_tensors": 0,
|
|
"matching_tensors": 0,
|
|
"mismatched_tensors": 0,
|
|
"missing_tensors": 0,
|
|
},
|
|
}
|
|
|
|
# Check if module exists in both directories
|
|
if module not in modules1:
|
|
module_report["error"] = f"Module missing in {dir1}"
|
|
step_report["summary"]["missing_modules"] += 1
|
|
step_report["modules"][module] = module_report
|
|
continue
|
|
|
|
if module not in modules2:
|
|
module_report["error"] = f"Module missing in {dir2}"
|
|
step_report["summary"]["missing_modules"] += 1
|
|
step_report["modules"][module] = module_report
|
|
continue
|
|
|
|
# Compare JSON metadata
|
|
for json_type in ["inputs_json", "outputs_json"]:
|
|
json_files1 = modules1[module].get(json_type, [])
|
|
json_files2 = modules2[module].get(json_type, [])
|
|
|
|
if json_files1 and json_files2:
|
|
json1 = load_json(json_files1[0])
|
|
json2 = load_json(json_files2[0])
|
|
|
|
json_comparison = compare_json_values(json1, json2)
|
|
json_name = json_type.replace("_json", "")
|
|
module_report[f"{json_name}_metadata"] = json_comparison
|
|
|
|
# Add file paths for manual checking when there's a mismatch
|
|
if not json_comparison.get("match", True):
|
|
module_report[f"{json_name}_metadata"]["file1"] = str(
|
|
json_files1[0]
|
|
)
|
|
module_report[f"{json_name}_metadata"]["file2"] = str(
|
|
json_files2[0]
|
|
)
|
|
|
|
# Compare input tensors
|
|
input_tensors1 = {p.name: p for p in modules1[module].get("inputs", [])}
|
|
input_tensors2 = {p.name: p for p in modules2[module].get("inputs", [])}
|
|
all_input_names = set(input_tensors1.keys()) | set(input_tensors2.keys())
|
|
|
|
for tensor_name in sorted(all_input_names):
|
|
if tensor_name not in input_tensors1:
|
|
module_report["inputs"][tensor_name] = {
|
|
"match": False,
|
|
"error": f"Tensor missing in {dir1}",
|
|
}
|
|
module_report["summary"]["missing_tensors"] += 1
|
|
elif tensor_name not in input_tensors2:
|
|
module_report["inputs"][tensor_name] = {
|
|
"match": False,
|
|
"error": f"Tensor missing in {dir2}",
|
|
}
|
|
module_report["summary"]["missing_tensors"] += 1
|
|
else:
|
|
tensor1 = load_tensor(input_tensors1[tensor_name])
|
|
tensor2 = load_tensor(input_tensors2[tensor_name])
|
|
|
|
comparison = compare_tensors(tensor1, tensor2, rtol, atol)
|
|
# Add file paths for manual checking when there's a mismatch
|
|
if not comparison.get("match", False):
|
|
comparison["file1"] = str(input_tensors1[tensor_name])
|
|
comparison["file2"] = str(input_tensors2[tensor_name])
|
|
|
|
module_report["inputs"][tensor_name] = comparison
|
|
|
|
if comparison.get("match", False):
|
|
module_report["summary"]["matching_tensors"] += 1
|
|
else:
|
|
module_report["summary"]["mismatched_tensors"] += 1
|
|
|
|
module_report["summary"]["total_tensors"] += 1
|
|
|
|
# Compare output tensors
|
|
output_tensors1 = {p.name: p for p in modules1[module].get("outputs", [])}
|
|
output_tensors2 = {p.name: p for p in modules2[module].get("outputs", [])}
|
|
all_output_names = set(output_tensors1.keys()) | set(output_tensors2.keys())
|
|
|
|
for tensor_name in sorted(all_output_names):
|
|
if tensor_name not in output_tensors1:
|
|
module_report["outputs"][tensor_name] = {
|
|
"match": False,
|
|
"error": f"Tensor missing in {dir1}",
|
|
}
|
|
module_report["summary"]["missing_tensors"] += 1
|
|
elif tensor_name not in output_tensors2:
|
|
module_report["outputs"][tensor_name] = {
|
|
"match": False,
|
|
"error": f"Tensor missing in {dir2}",
|
|
}
|
|
module_report["summary"]["missing_tensors"] += 1
|
|
else:
|
|
tensor1 = load_tensor(output_tensors1[tensor_name])
|
|
tensor2 = load_tensor(output_tensors2[tensor_name])
|
|
|
|
comparison = compare_tensors(tensor1, tensor2, rtol, atol)
|
|
# Add file paths for manual checking when there's a mismatch
|
|
if not comparison.get("match", False):
|
|
comparison["file1"] = str(output_tensors1[tensor_name])
|
|
comparison["file2"] = str(output_tensors2[tensor_name])
|
|
|
|
module_report["outputs"][tensor_name] = comparison
|
|
|
|
if comparison.get("match", False):
|
|
module_report["summary"]["matching_tensors"] += 1
|
|
else:
|
|
module_report["summary"]["mismatched_tensors"] += 1
|
|
|
|
module_report["summary"]["total_tensors"] += 1
|
|
|
|
# Update module status
|
|
if module_report["summary"]["mismatched_tensors"] > 0:
|
|
step_report["summary"]["mismatched_modules"] += 1
|
|
else:
|
|
step_report["summary"]["matching_modules"] += 1
|
|
|
|
step_report["summary"]["total_modules"] += 1
|
|
step_report["modules"][module] = module_report
|
|
step_report["module_call_list"].append(module)
|
|
|
|
report["steps"][step] = step_report
|
|
|
|
# Add overall summary
|
|
report["summary"] = {
|
|
"total_steps": len(all_steps),
|
|
"total_modules": sum(
|
|
step_report["summary"]["total_modules"]
|
|
for step_report in report["steps"].values()
|
|
),
|
|
"matching_modules": sum(
|
|
step_report["summary"]["matching_modules"]
|
|
for step_report in report["steps"].values()
|
|
),
|
|
"mismatched_modules": sum(
|
|
step_report["summary"]["mismatched_modules"]
|
|
for step_report in report["steps"].values()
|
|
),
|
|
"missing_modules": sum(
|
|
step_report["summary"]["missing_modules"]
|
|
for step_report in report["steps"].values()
|
|
),
|
|
"total_tensors": sum(
|
|
module_report["summary"]["total_tensors"]
|
|
for step_report in report["steps"].values()
|
|
for module_name, module_report in step_report["modules"].items()
|
|
if "summary" in module_report
|
|
),
|
|
"matching_tensors": sum(
|
|
module_report["summary"]["matching_tensors"]
|
|
for step_report in report["steps"].values()
|
|
for module_name, module_report in step_report["modules"].items()
|
|
if "summary" in module_report
|
|
),
|
|
"mismatched_tensors": sum(
|
|
module_report["summary"]["mismatched_tensors"]
|
|
for step_report in report["steps"].values()
|
|
for module_name, module_report in step_report["modules"].items()
|
|
if "summary" in module_report
|
|
),
|
|
"missing_tensors": sum(
|
|
module_report["summary"]["missing_tensors"]
|
|
for step_report in report["steps"].values()
|
|
for module_name, module_report in step_report["modules"].items()
|
|
if "summary" in module_report
|
|
),
|
|
}
|
|
|
|
return report
|
|
|
|
|
|
def generate_markdown_report(report: Dict, verbose: bool = False) -> str:
|
|
"""Generate a markdown report from the comparison results."""
|
|
lines = []
|
|
|
|
# Add header
|
|
lines.append("# Intermediate Logging Comparison Report")
|
|
lines.append("")
|
|
lines.append("Comparing intermediate logging outputs between:")
|
|
lines.append(f"- **Directory 1**: `{report['dir1']}`")
|
|
lines.append(f"- **Directory 2**: `{report['dir2']}`")
|
|
lines.append("")
|
|
lines.append(f"Comparison parameters:")
|
|
lines.append(f"- Relative tolerance (rtol): {report['rtol']}")
|
|
lines.append(f"- Absolute tolerance (atol): {report['atol']}")
|
|
lines.append("")
|
|
|
|
# Add overall summary
|
|
summary = report["summary"]
|
|
lines.append("## Overall Summary")
|
|
lines.append("")
|
|
lines.append("| Category | Total | Matching | Mismatched | Missing |")
|
|
lines.append("|----------|-------|----------|------------|---------|")
|
|
lines.append(f"| Steps | {summary['total_steps']} | - | - | - |")
|
|
lines.append(
|
|
f"| Modules | {summary['total_modules']} | {summary['matching_modules']} | {summary['mismatched_modules']} | {summary['missing_modules']} |"
|
|
)
|
|
lines.append(
|
|
f"| Tensors | {summary['total_tensors']} | {summary['matching_tensors']} | {summary['mismatched_tensors']} | {summary['missing_tensors']} |"
|
|
)
|
|
lines.append("")
|
|
|
|
# Add step details
|
|
for step_name, step_report in sorted(report["steps"].items()):
|
|
step_summary = step_report["summary"]
|
|
|
|
lines.append(f"## {step_name}")
|
|
lines.append("")
|
|
lines.append(
|
|
f"**Summary**: {step_summary['matching_modules']} matching modules, {step_summary['mismatched_modules']} mismatched modules, {step_summary['missing_modules']} missing modules"
|
|
)
|
|
lines.append("")
|
|
|
|
# Add module details
|
|
for module_name in step_report["module_call_list"]:
|
|
module_report = step_report["modules"][module_name]
|
|
if "error" in module_report:
|
|
lines.append(f"### ❌ {module_name}")
|
|
lines.append("")
|
|
lines.append(f"**Error**: {module_report['error']}")
|
|
lines.append("")
|
|
continue
|
|
|
|
module_summary = module_report["summary"]
|
|
|
|
# Determine module status
|
|
if module_summary["mismatched_tensors"] > 0:
|
|
status = "❌"
|
|
else:
|
|
status = "✅"
|
|
|
|
lines.append(f"### {status} {module_name}")
|
|
lines.append("")
|
|
lines.append(
|
|
f"**Summary**: {module_summary['matching_tensors']} matching tensors, {module_summary['mismatched_tensors']} mismatched tensors, {module_summary['missing_tensors']} missing tensors"
|
|
)
|
|
lines.append("")
|
|
|
|
# Add metadata comparison results if available
|
|
for metadata_type in ["inputs_metadata", "outputs_metadata"]:
|
|
if metadata_type in module_report:
|
|
metadata_comparison = module_report[metadata_type]
|
|
if not metadata_comparison.get("match", True):
|
|
file_paths = ""
|
|
if (
|
|
"file1" in metadata_comparison
|
|
and "file2" in metadata_comparison
|
|
):
|
|
file_paths = f" - Files: `{metadata_comparison['file1']}` vs `{metadata_comparison['file2']}`"
|
|
|
|
lines.append(
|
|
f"**{metadata_type.capitalize()}**: Mismatch detected{file_paths}"
|
|
)
|
|
if verbose and "mismatches" in metadata_comparison:
|
|
lines.append("```json")
|
|
lines.append(
|
|
json.dumps(metadata_comparison["mismatches"], indent=2)
|
|
)
|
|
lines.append("```")
|
|
lines.append("")
|
|
|
|
# Add tensor comparison details
|
|
if module_summary["mismatched_tensors"] > 0 or verbose:
|
|
# Add input tensor details
|
|
if module_report["inputs"]:
|
|
lines.append("#### Input Tensors")
|
|
lines.append("")
|
|
lines.append("| Tensor | Status | Details |")
|
|
lines.append("|--------|--------|---------|")
|
|
|
|
for tensor_name, comparison in sorted(
|
|
module_report["inputs"].items()
|
|
):
|
|
if comparison.get("match", False):
|
|
status = "✅"
|
|
details = "Tensors match"
|
|
elif "error" in comparison:
|
|
status = "❌"
|
|
details = comparison["error"]
|
|
else:
|
|
status = "❌"
|
|
details = f"Max abs diff: {comparison.get('max_abs_diff', 'N/A'):.2e}, "
|
|
details = f"Max relative diff: {comparison.get('max_rel_diff', 'N/A'):.2e}, "
|
|
details += f"Diff elements: {comparison.get('num_diff_elements', 'N/A')}/{comparison.get('total_elements', 'N/A')}"
|
|
if "file1" in comparison and "file2" in comparison:
|
|
details += f"<br>Files: `{comparison['file1']}` vs `{comparison['file2']}`"
|
|
|
|
lines.append(f"| {tensor_name} | {status} | {details} |")
|
|
|
|
lines.append("")
|
|
|
|
# Add output tensor details
|
|
if module_report["outputs"]:
|
|
lines.append("#### Output Tensors")
|
|
lines.append("")
|
|
lines.append("| Tensor | Status | Details |")
|
|
lines.append("|--------|--------|---------|")
|
|
|
|
for tensor_name, comparison in sorted(
|
|
module_report["outputs"].items()
|
|
):
|
|
if comparison.get("match", False):
|
|
status = "✅"
|
|
details = "Tensors match"
|
|
elif "error" in comparison:
|
|
status = "❌"
|
|
details = comparison["error"]
|
|
else:
|
|
status = "❌"
|
|
details = f"Max abs diff: {comparison.get('max_abs_diff', 'N/A')}, "
|
|
details = f"Max relative diff: {comparison.get('max_rel_diff', 'N/A')}, "
|
|
details += f"Diff elements: {comparison.get('num_diff_elements', 'N/A')}/{comparison.get('total_elements', 'N/A')}"
|
|
|
|
lines.append(f"| {tensor_name} | {status} | {details} |")
|
|
|
|
lines.append("")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Compare intermediate logging outputs from two different runs."
|
|
)
|
|
parser.add_argument(
|
|
"--dir1", required=True, help="First intermediate logging directory"
|
|
)
|
|
parser.add_argument(
|
|
"--dir2", required=True, help="Second intermediate logging directory"
|
|
)
|
|
parser.add_argument("--output", help="Output file for the report (default: stdout)")
|
|
parser.add_argument(
|
|
"--rtol",
|
|
type=float,
|
|
default=None,
|
|
help="Relative tolerance for tensor comparison (default: 1e-5)",
|
|
)
|
|
parser.add_argument(
|
|
"--atol",
|
|
type=float,
|
|
default=None,
|
|
help="Absolute tolerance for tensor comparison (default: 1e-8)",
|
|
)
|
|
parser.add_argument(
|
|
"--steps", help="Comma-separated list of steps to compare (default: all)"
|
|
)
|
|
parser.add_argument(
|
|
"--modules",
|
|
help="Comma-separated list of module name patterns to compare (default: all)",
|
|
)
|
|
parser.add_argument(
|
|
"--verbose",
|
|
action="store_true",
|
|
help="Include detailed information about each tensor",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Parse steps and modules
|
|
steps = args.steps.split(",") if args.steps else None
|
|
module_patterns = args.modules.split(",") if args.modules else None
|
|
|
|
# Compare directories
|
|
report = compare_directories(
|
|
Path(args.dir1),
|
|
Path(args.dir2),
|
|
rtol=args.rtol,
|
|
atol=args.atol,
|
|
steps=steps,
|
|
module_patterns=module_patterns,
|
|
)
|
|
|
|
# Generate report
|
|
output = generate_markdown_report(report, verbose=args.verbose)
|
|
|
|
# Write report
|
|
if args.output:
|
|
with open(args.output, "w") as f:
|
|
f.write(output)
|
|
print(f"Report written to {args.output}")
|
|
else:
|
|
print(output)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|
|
|
|
def invoke_main() -> None:
|
|
main()
|