@@ -7,6 +7,7 @@
Policy,
RangeConfig,
RegressionPolicy,
ReportConfig,
)


@@ -37,7 +38,6 @@
"startTime": "{{ startTime }}",
"stopTime": "{{ stopTime }}",
"suites": ["torchbench", "huggingface", "timm_models"],
"workflowId": 0,
"branches": ["main"]
}
}
@@ -79,6 +79,9 @@
"issue": "7081",
},
),
report_config=ReportConfig(
report_level="no_regression",
),
)

BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook(
@@ -1,5 +1,6 @@
from __future__ import annotations

import dataclasses
import json
from dataclasses import dataclass, field
from datetime import timedelta
@@ -223,6 +224,56 @@ def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]:
return GitHubNotificationConfig.from_dict(self.notification_config)


ReportSeverity = Literal[
"none",
"no_regression",
"insufficient_data",
"suspicious",
"regression",
"unknown",
]


# Mapping from severity label → numeric level.
# Higher numbers mean greater severity; the order is used to compare and filter results before DB upload.
#
# Effects on DB upload:
# - "unknown": (-1) fallback for invalid input → excluded from DB
# - "none": (0) no report generated → results field is empty, nothing uploaded
# - "no_regression": (1) clean report → all results with severity >= 1 are uploaded
# - "insufficient_data": (2) weak signal → uploaded, also includes suspicious/regression
# - "suspicious": (3) medium signal → uploaded, also includes regression
# - "regression": (4) strongest severity → always uploaded
#
# This allows filtering: e.g., if threshold = 2, DB upload includes
# "insufficient_data", "suspicious", and "regression" but not "none" or "no_regression".
SEVERITY_ORDER: dict[ReportSeverity, int] = {
"unknown": -1, # fallback (bad/invalid input)
"none": 0, # no report generated; nothing is uploaded to the db results field
"no_regression": 1, # clean report; all results with severity >= 1 are uploaded
"insufficient_data": 2, # weak signal
"suspicious": 3, # medium signal
"regression": 4, # strongest severity
}


@dataclass
class ReportConfig:
"""
Decides what to include in the db summary report.
report_level: the lowest severity level to store in the db.
"""

report_level: ReportSeverity = "regression"

def get_severity_map(self) -> dict[ReportSeverity, int]:
# a shallow copy is safe since this is a flat dict.
return SEVERITY_ORDER.copy()

def get_order(self) -> int:
return self.get_severity_map().get(self.report_level, -1)
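
# A minimal usage sketch (illustrative, not part of this module): it shows how a
# consumer such as the report manager is expected to apply the report_level
# threshold. The helper name and the sample "label" values are hypothetical.
def _example_filter_results(
    results: list[dict[str, Any]], report_config: ReportConfig
) -> list[dict[str, Any]]:
    order = report_config.get_severity_map()
    threshold = report_config.get_order()
    # keep only results at or above the configured severity threshold
    return [r for r in results if order.get(r.get("label"), -1) >= threshold]

# With report_level="insufficient_data" (order 2), a result labeled
# "no_regression" (order 1) is dropped while "regression" (order 4) is kept.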


# -------- Top-level benchmark regression config --------
@dataclass
class BenchmarkConfig:
@@ -242,6 +293,7 @@ class BenchmarkConfig:
source: BenchmarkApiSource
policy: Policy
hud_info: Optional[dict[str, Any]] = None
report_config: ReportConfig = field(default_factory=ReportConfig)


@dataclass
@@ -253,3 +305,13 @@ def __getitem__(self, key: str) -> BenchmarkConfig:
if not config:
raise KeyError(f"Config {key} not found")
return config


def to_dict(x: Any) -> Any:
if dataclasses.is_dataclass(x):
return {f.name: to_dict(getattr(x, f.name)) for f in dataclasses.fields(x)}
if isinstance(x, dict):
return {k: to_dict(v) for k, v in x.items()}
if isinstance(x, (list, tuple, set)):
return [to_dict(v) for v in x]
return x # primitive or already JSON-serializable
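
# Illustrative only, not part of this PR: a sketch of what to_dict produces for
# a nested dataclass. _DemoConfig is a hypothetical dataclass used purely to
# show the expected output shape.
if __name__ == "__main__":

    @dataclass
    class _DemoConfig:
        name: str
        report_config: ReportConfig = field(default_factory=ReportConfig)

    print(json.dumps(to_dict(_DemoConfig(name="compiler_regression"))))
    # {"name": "compiler_regression", "report_config": {"report_level": "regression"}}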
@@ -1,17 +1,12 @@
import dataclasses
import datetime as dt
import json
import logging
import uuid
from typing import Any, Dict

import clickhouse_connect
from common.config_model import BenchmarkConfig, Frequency
from common.regression_utils import (
BenchmarkRegressionReport,
get_regression_status,
PerGroupResult,
)
from common.config_model import BenchmarkConfig, ReportConfig, to_dict
from common.regression_utils import BenchmarkRegressionReport, get_regression_status
from jinja2 import Template


@@ -71,9 +66,6 @@
"""


logger = logging.getLogger()


class ReportManager:
"""
handles db insertion and notification processing
@@ -91,27 +83,28 @@ def __init__(
):
self.is_dry_run = is_dry_run

self.report = regression_report
self.raw_report = regression_report

self.config_id = config.id
self.config = config

self.type = type
self.repo = repo
self.db_table_name = db_table_name

self.id = str(uuid.uuid4())

# extract the latest metadata from the report
self.baseline = self.report["baseline_meta_data"]
self.target = self.report["new_meta_data"]
self.baseline = self.raw_report["baseline_meta_data"]
self.target = self.raw_report["new_meta_data"]
self.target_latest_commit = self.target["end"]["commit"]
self.target_latest_ts_str = self.target["end"]["timestamp"]
self.status = get_regression_status(self.report["summary"])
self.status = get_regression_status(self.raw_report["summary"])

self.report_data = self._to_report_data(
config_id=config.id,
regression_report=self.report,
frequency=self.config.policy.frequency,
regression_report=self.raw_report,
config=self.config,
status=self.status,
)

def run(
@@ -146,7 +139,7 @@ def notify_github_comment(self, github_token: str):
)
return
logger.info("[%s] preparing github comment content", self.config_id)
content = self._to_markdoown()
content = self._to_markdown()
if self.is_dry_run:
logger.info(
"[%s] dry run, skip sending comment to github, report(%s)",
@@ -161,31 +154,24 @@ def notify_github_comment(self, github_token: str):
github_notification.create_github_comment(content, github_token)
logger.info("[%s] done. comment is sent to github", self.config_id)

def _to_markdoown(self):
self.regression_items = self._collect_regression_items()
url = ""
if self.config.hud_info:
url = self.config.hud_info.get("url", "")

md = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(
def _to_markdown(self) -> str:
regression_items = [
r for r in self.raw_report["results"] if r.get("label") == "regression"
]
url = (self.config.hud_info or {}).get("url", "")
return Template(
REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True
).render(
id=self.id,
url=url,
status=self.status,
report_id=self.config_id,
summary=self.report["summary"],
summary=self.raw_report["summary"],
baseline=self.baseline,
target=self.target,
frequency=self.config.policy.frequency.get_text(),
regression_items=self.regression_items,
regression_items=regression_items,
)
return md

def _collect_regression_items(self) -> list[PerGroupResult]:
items = []
for item in self.report["results"]:
if item["label"] == "regression":
items.append(item)
return items

def insert_to_db(
self,
@@ -205,7 +191,7 @@ def insert_to_db(

try:
report_json = json.dumps(
self.report, ensure_ascii=False, separators=(",", ":"), default=str
self.report_data, ensure_ascii=False, separators=(",", ":"), default=str
)
except Exception:
logger.exception(
@@ -214,12 +200,12 @@
)
raise

regression_summary = self.report["summary"]
regression_summary = self.raw_report["summary"]
params = {
"id": str(self.id),
"report_id": self.config_id,
"type": self.type,
"status": get_regression_status(self.report["summary"]),
"status": get_regression_status(self.raw_report["summary"]),
"last_record_commit": self.target_latest_commit,
"last_record_ts": last_record_ts,
"regression_count": regression_summary["regression_count"],
@@ -238,9 +224,10 @@
self.config_id,
self.id,
)
logger.info("[dry run] printing db params data")
if self.is_dry_run:
print(json.dumps(params, indent=2, default=str))
logger.info("[dry run] printing db params data")
print({k: v for k, v in params.items() if k != "report_json"})
print(json.dumps(self.report_data, indent=2, default=str))
logger.info("[dry run] done printing db params data")
return False
logger.info(
@@ -382,28 +369,32 @@ def _validate_latest_meta_info(
def _to_report_data(
self,
config_id: str,
config: BenchmarkConfig,
regression_report: BenchmarkRegressionReport,
frequency: Frequency,
status: str,
) -> dict[str, Any]:
if not self.target_latest_commit:
raise ValueError(
f"missing commit in the new metadata, latest is {self.target}"
)
latest_ts_str = self.target_latest_ts_str
if not latest_ts_str:
raise ValueError(f"missing timestamp in the new metadata, latest is {self.target}")

def to_dict(x): # handle dataclass or dict/object
if dataclasses.is_dataclass(x):
return dataclasses.asdict(x)
if isinstance(x, dict):
return x
return vars(x) if hasattr(x, "__dict__") else {"value": str(x)}

report = to_dict(regression_report)
filtered = self._filter_report(regression_report, config.report_config)
logger.info("policy: %s", to_dict(self.config.policy))
return {
"status": self.status,
"status": status,
"report_id": config_id,
"report": report,
"frequency": frequency.get_text(),
"policy": to_dict(config.policy),
"report": to_dict(filtered),
}

def _filter_report(
self, report: BenchmarkRegressionReport, report_config: ReportConfig
) -> dict[str, Any]:
new_report = {k: v for k, v in report.items() if k != "results"}
level_order = report_config.get_severity_map()
threshold = report_config.get_order()

new_report["results"] = [
r for r in report["results"] if level_order.get(r["label"], -1) >= threshold
]
return new_report
@@ -1,7 +1,6 @@
#!/usr/bin/env python
import argparse
import datetime as dt
import json
import logging
import os
import threading
@@ -155,8 +154,9 @@ def process(
config=config, target_ts=target, baseline_ts=baseline
)
regression_report = generator.generate()
if self.is_dry_run:
print(json.dumps(regression_report, indent=2, default=str))
# debugging only
# if self.is_dry_run:
# print(json.dumps(regression_report, indent=2, default=str))
reportManager = ReportManager(
config=config,
regression_report=regression_report,