{"generated":"2026-05-12","total_scored_runs":100,"stages":{"baseline":{"runs":20,"repos":5,"evidence_coverage_pct":78.7,"evidence_tag_relaxed_pct":90.78,"boundary_accuracy":0.95,"actionability":4,"hallucinations":0,"median_latency_s":133.8},"bgi_mcp":{"runs":20,"repos":5,"evidence_coverage_pct":84.9,"evidence_tag_relaxed_pct":94.2,"boundary_accuracy":1,"actionability":4,"hallucinations":0,"median_latency_s":66.2,"delta_vs_baseline":{"evidence_pp":6.2,"boundary":0.05,"latency_pct":-51}},"bgi_twin":{"runs":20,"repos":5,"evidence_coverage_pct_mean":79.9,"evidence_coverage_pct_p04":96,"evidence_tag_relaxed_pct_mean":94.84,"evidence_tag_relaxed_pct_p04":100,"boundary_accuracy":1,"actionability":4.75,"hallucinations":0,"median_latency_s":68.5,"delta_vs_bgi_mcp":{"actionability":0.75},"tools":["task_fingerprint","behavioral_twins","twin_context"],"mcp_invocation_evidence":"CallToolRequest confirmed in all 20 runs"},"bgi_twin_replication_gpt4o":{"runs":20,"repos":5,"evidence_coverage_pct_mean":47.9,"evidence_coverage_pct_p04":49.3,"evidence_tag_relaxed_pct_mean":59.48,"evidence_tag_relaxed_pct_p04":62.72,"boundary_accuracy":1,"actionability":4.85,"hallucinations":0,"median_latency_s":41.55,"model":"azure/gpt-4o","notes":"Independent-model replication of full TWIN refresh (p01-p04 x 5 repos)"},"bgi_twin_replication_gemini_auto":{"runs":20,"repos":5,"evidence_coverage_pct_mean":62.36,"evidence_tag_relaxed_pct_mean":83.41,"boundary_accuracy":0.95,"actionability":4.25,"hallucinations":0,"median_latency_s":65.75,"model":"gemini/auto","notes":"Independent-model replication on Gemini CLI auto mode; django/p02 is one genuine architectural miss (depth-first on query.py), all others correct"}},"repos":["tiangolo/fastapi","django/django","pydantic/pydantic-core","prometheus/prometheus","vercel/next.js"],"cli":"opencode 1.14.41 / gemini CLI (auto)","model":"deepseek-v4-flash + azure/gpt-4o + gemini/auto","rubric":"https://github.com/ahmedxuhri/bigindexer/blob/master/validation/SCORING_RUBRIC.md","raw_outputs":"https://github.com/ahmedxuhri/bigindexer/tree/master/validation/runs"}