"""Анализ JSONL-выхода bench-runner: сводная markdown-таблица.

Usage:
    python analyze-results.py results.jsonl > report.md
"""
import json
import sys
from collections import defaultdict
from statistics import median


# Verdict values that mean "the model's answer could not be parsed".
_PARSE_ERROR_VERDICTS = frozenset({"?", "PARSE_ERR"})


def _load_rows(jsonl_path: str) -> list:
    """Read a JSONL file and return the records usable for the report.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    anything other than an object with a "model" key, are skipped; their line
    numbers are reported on stderr so the report on stdout stays clean.
    """
    rows = []
    skipped = []
    with open(jsonl_path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except (json.JSONDecodeError, RecursionError):
                skipped.append(lineno)
                continue
            # Every later step groups by record["model"], so a record without
            # it (or a non-object JSON value) cannot be attributed to anything.
            if isinstance(record, dict) and "model" in record:
                rows.append(record)
            else:
                skipped.append(lineno)

    if skipped:
        shown = ", ".join(str(n) for n in skipped[:10])
        more = ", ..." if len(skipped) > 10 else ""
        print(
            f"warning: skipped {len(skipped)} unusable line(s) "
            f"in {jsonl_path}: {shown}{more}",
            file=sys.stderr,
        )
    return rows


def _upper(value) -> str:
    """Return *value* upper-cased when it is a string, otherwise ""."""
    return value.upper() if isinstance(value, str) else ""


def _is_correct(record: dict) -> bool:
    """Tell whether the record's verdict matches its ground truth.

    The comparison is case-insensitive on both sides. A record without a
    (non-empty, string) ground truth is never considered correct.
    """
    truth = _upper(record.get("ground_truth"))
    return bool(truth) and truth == _upper(record.get("verdict"))


def main(jsonl_path: str) -> None:
    """Print a markdown benchmark report for the JSONL file at *jsonl_path*.

    The report is written to stdout and has three parts: a per-model summary
    table, a list of cases on which models returned different verdicts, and a
    per-model "decision profile" based on the share of MAYBE verdicts.

    Args:
        jsonl_path: Path to a UTF-8 JSONL file, one JSON object per line.

    Raises:
        OSError: If the file cannot be opened.
    """
    rows = _load_rows(jsonl_path)

    by_model = defaultdict(list)
    for r in rows:
        by_model[r["model"]].append(r)

    print("# Бенчмарк моделей: отчёт\n")
    print(f"Всего запросов: **{len(rows)}**")
    print(f"Моделей: **{len(by_model)}**")
    case_ids = sorted({r["case_id"] for r in rows if "case_id" in r})
    print(f"Кейсов: **{len(case_ids)}**\n")

    print("## Сводная таблица\n")
    print("| Модель | YES | NO | MAYBE | err | accuracy | latency p50 | latency max | стоимость |")
    print("|---|---:|---:|---:|---:|---:|---:|---:|---:|")

    for model, results in sorted(by_model.items()):
        valid = [r for r in results if "verdict" in r]

        yes_n = sum(1 for r in valid if r["verdict"] == "YES")
        no_n = sum(1 for r in valid if r["verdict"] == "NO")
        maybe_n = sum(1 for r in valid if r["verdict"] == "MAYBE")

        # Accuracy is measured only over rows that carry a ground truth;
        # rows without one cannot be right or wrong.
        scored = [r for r in valid if _upper(r.get("ground_truth"))]
        correct = sum(1 for r in scored if _is_correct(r))
        acc = f"{correct}/{len(scored)}" if scored else "—"

        latencies = [r["latency_s"] for r in valid if r.get("latency_s") is not None]
        lat_p50 = f"{median(latencies):.1f}s" if latencies else "—"
        lat_max = f"{max(latencies):.1f}s" if latencies else "—"

        # `or 0` also covers an explicit null cost.
        total_cost = sum(r.get("cost_usd") or 0 for r in valid)
        cost = f"${total_cost:.4f}" if total_cost else "—"

        # Count each failed row once, even if it has both an "error" key and
        # a parse-error verdict.
        err_count = sum(
            1 for r in results
            if "error" in r or r.get("verdict") in _PARSE_ERROR_VERDICTS
        )
        print(f"| `{model}` | {yes_n} | {no_n} | {maybe_n} | {err_count} | {acc} | {lat_p50} | {lat_max} | {cost} |")

    # Disagreements between models on the same case.
    print("\n## Где модели расходятся (по кейсам)\n")
    by_case = defaultdict(list)
    for r in rows:
        if "verdict" in r and "case_id" in r:
            by_case[r["case_id"]].append(r)

    for case_id in case_ids:
        results = by_case.get(case_id, [])
        if not results:
            continue
        verdicts = {r["verdict"] for r in results}
        if len(verdicts) <= 1:
            continue  # every model agrees on this case
        # Case metadata is taken from the first record seen for the case.
        gt = results[0].get("ground_truth")
        if gt is None:
            gt = "?"
        kind = results[0].get("kind", "?")
        print(f"### Кейс {case_id} (kind={kind}, ground_truth={gt})\n")
        for r in sorted(results, key=lambda x: x["model"]):
            mark = "✓" if _is_correct(r) else "✗"
            reason = (r.get("reason") or "")[:140]
            print(f"- `{r['model']}`: **{r['verdict']}** {mark} — {reason}")
        print()

    # Per-model decision profiles.
    print("\n## Decision profiles\n")
    print("Какая модель ближе к decisive (YES/NO без MAYBE), какая к hedged (часто MAYBE):\n")

    for model, results in sorted(by_model.items()):
        valid = [r for r in results if "verdict" in r]
        if not valid:
            continue
        total = len(valid)
        maybe_pct = sum(1 for r in valid if r["verdict"] == "MAYBE") / total * 100
        if maybe_pct >= 25:
            label = "**hedged** (любит MAYBE)"
        elif maybe_pct >= 10:
            label = "balanced"
        else:
            label = "**decisive** (режет в YES/NO)"
        print(f"- `{model}`: {label} — {maybe_pct:.0f}% MAYBE")


if __name__ == "__main__":
    # Script entry point: the first positional argument is the JSONL path.
    cli_args = sys.argv[1:]
    if not cli_args:
        # No path given: show usage on stderr and exit with a failure status.
        print("Usage: python analyze-results.py results.jsonl > report.md", file=sys.stderr)
        sys.exit(1)
    main(cli_args[0])
