import random
from multivon_eval import EvalCase, EvalSuite, Experiment
# Evaluation fixtures: one EvalCase per (prompt, source text) pair. The prompt
# strings double as lookup keys for the canned summarizer tables in this file,
# so they must match those keys exactly.
cases = [
    EvalCase(input=prompt, context=source_text)
    for prompt, source_text in (
        (
            "Summarize the weekly product update.",
            "This week the mobile team shipped a new onboarding flow, fixed two crash bugs, "
            "and delayed analytics export until next week.",
        ),
        (
            "Summarize the finance memo.",
            "Finance held the 2026 hiring plan flat, reduced travel by 12%, "
            "and approved a new vendor review process.",
        ),
        (
            "Summarize the incident review.",
            "The API outage lasted 14 minutes, the root cause was an expired TLS certificate, "
            "and the team added automated certificate rotation.",
        ),
    )
]
# Canned outputs for the "before" summarizer, keyed by the exact prompt text.
# Each prompt maps to two candidates: the first restates the facts given in the
# matching case context, while the second changes one of them (e.g. "launched"
# instead of "delayed"), so a random pick between the two can disagree with the
# source text from call to call.
summary_options_v1 = {
    "Summarize the weekly product update.": [
        "The team shipped a new onboarding flow, fixed two crash bugs, and delayed analytics export until next week.",
        "The team shipped a new onboarding flow, fixed two crash bugs, and launched analytics export worldwide.",
    ],
    "Summarize the finance memo.": [
        "Finance kept the 2026 hiring plan flat, cut travel by 12%, and approved a vendor review process.",
        "Finance expanded hiring by 12%, increased travel, and approved a vendor review process.",
    ],
    "Summarize the incident review.": [
        "The outage lasted 14 minutes, came from an expired TLS certificate, and led to automated certificate rotation.",
        "The outage lasted 14 minutes, was caused by a database failover, and led to automated certificate rotation.",
    ],
}
# Canned outputs for the "after" summarizer, keyed by the exact prompt text.
# Each prompt again maps to two candidates. The first is the same sentence used
# as the first candidate in the v1 table; the second appears to be a rewording
# of the same facts rather than a contradiction of them.
summary_options_v2 = {
    "Summarize the weekly product update.": [
        "The team shipped a new onboarding flow, fixed two crash bugs, and delayed analytics export until next week.",
        "This week's update focused on onboarding improvements, crash fixes, and delaying analytics export until next week.",
    ],
    "Summarize the finance memo.": [
        "Finance kept the 2026 hiring plan flat, cut travel by 12%, and approved a vendor review process.",
        "The memo says hiring stays flat for 2026, travel drops by 12%, and vendor review is now required.",
    ],
    "Summarize the incident review.": [
        "The outage lasted 14 minutes, came from an expired TLS certificate, and led to automated certificate rotation.",
        "The review says a 14-minute outage was traced to an expired TLS certificate and prevented with certificate rotation.",
    ],
}
def summarizer_v1(prompt: str) -> str:
    """Return a randomly chosen canned summary for *prompt* from the v1 table.

    The prompt is used verbatim as a key into ``summary_options_v1`` and one
    of the listed candidates is picked with ``random.choice``, so repeated
    calls with the same prompt may return different text.

    Raises:
        KeyError: if *prompt* is not a key of ``summary_options_v1``.
    """
    candidates = summary_options_v1[prompt]
    return random.choice(candidates)
def summarizer_v2(prompt: str) -> str:
    """Return a randomly chosen canned summary for *prompt* from the v2 table.

    The prompt is used verbatim as a key into ``summary_options_v2`` and one
    of the listed candidates is picked with ``random.choice``, so repeated
    calls with the same prompt may return different wording.

    Raises:
        KeyError: if *prompt* is not a key of ``summary_options_v2``.
    """
    candidates = summary_options_v2[prompt]
    return random.choice(candidates)
# Assemble the summarization suite from the fixture cases and open an
# experiment to hold both recorded runs.
suite = EvalSuite.for_summarization("Summarizer Stability Eval")
suite.add_cases(cases)
exp = Experiment("summarizer-prompt-change")

# "Before" run: evaluate the v1 stub with runs=5 (presumably five executions
# per case — confirm against the multivon_eval docs) and record the report.
# The tags are plain metadata; the stub summarizers never read a temperature.
report_v1 = suite.run(summarizer_v1, runs=5, verbose=True)
run_v1 = exp.record(
    report_v1,
    tags={"prompt_version": "before", "temperature": "0.7"},
)

# Print the stability figures for the "before" run, then one line per case
# that the report marks as flaky.
print(f"Flaky cases: {report_v1.flaky_count}")
print(f"Stability: {report_v1.stability_score:.0%}")
for case_result in report_v1.case_results:
    if not case_result.is_flaky:
        continue
    print(
        f" FLAKY: {case_result.case_input} "
        f"({case_result.pass_count}/{case_result.runs} runs passed)"
    )

# "After" run: same suite and run count against the v2 stub, recorded under
# its own tags, then compared with the "before" run.
report_v2 = suite.run(summarizer_v2, runs=5, verbose=True)
run_v2 = exp.record(
    report_v2,
    tags={"prompt_version": "after", "temperature": "0.2"},
)
exp.compare(run_v1, run_v2)