{
  "schema_version": "2.1",
  "schema_notes": "v2.1 \u2014 re-scored from raw responses with the W3B fix bundle. Five scoring bugs were fixed (see corrections_report.md). The raw response data is unchanged from v2.0; only the scoring layer was corrected. Excluded rows (MAX_TOKENS-truncated empties) are removed from accuracy denominators when ``accuracy_among_scored`` is reported. The ``accuracy`` field is over ALL rows including excluded for back-compat with v2.0; prefer ``accuracy_among_scored`` for headline reporting.",
  "run_timestamp_utc": "2026-05-18T15:39:40.809720+00:00",
  "rescoring_bugs_fixed": [
    "BUG_1_nested_trademark_refusal_likely",
    "BUG_2_hedge_regex_overfire",
    "BUG_3_fab_reg_no_in_json",
    "BUG_4_etymolt_tm_axis_70_fallback",
    "BUG_5_max_tokens_ghost_rows"
  ],
  "total_rows": 4500,
  "n_excluded_max_tokens_truncated": 0,
  "models": {
    "claude-opus-4-7": {
      "n": 1500,
      "n_excluded_max_tokens_truncated": 0,
      "n_scored": 1500,
      "accuracy": 0.66,
      "accuracy_among_scored": 0.66,
      "false_negative_rate": 0.015281757402101241,
      "false_positive_rate": 0.33774834437086093,
      "hedge_rate": 0.22666666666666666,
      "unparseable_rate": 0.0006666666666666666,
      "hallucination_rate": 0.078,
      "confidence_brier_score": 0.16238980070339976,
      "counts": {
        "correct": 990,
        "false_negative": 16,
        "false_positive": 153,
        "hedge": 340,
        "unparseable": 1,
        "hallucinated": 117,
        "excluded": 0,
        "n_risky_truth": 1047,
        "n_safe_truth": 453
      },
      "surface_hallucination_rate": {
        "trademark": 0.1458153580672994,
        "domain": 0.12247838616714697,
        "handle": 0.0,
        "cultural": 0.077,
        "sound": 0.293
      },
      "surface_detail": {
        "trademark": {
          "n": 1500,
          "n_confident_assertions": 1159,
          "n_hedged": 340,
          "n_unparseable": 1,
          "hallucination_rate": 0.1458153580672994,
          "accuracy_among_assertions": 0.8541846419327006,
          "hedge_rate": 0.22666666666666666,
          "unparseable_rate": 0.0006666666666666666
        },
        "domain": {
          "n": 1500,
          "n_confident_assertions": 694,
          "n_hedged": 306,
          "n_unparseable": 500,
          "hallucination_rate": 0.12247838616714697,
          "accuracy_among_assertions": 0.877521613832853,
          "hedge_rate": 0.204,
          "unparseable_rate": 0.3333333333333333
        },
        "handle": {
          "n": 1500,
          "n_confident_assertions": 252,
          "n_hedged": 748,
          "n_unparseable": 500,
          "hallucination_rate": 0.0,
          "accuracy_among_assertions": 1.0,
          "hedge_rate": 0.49866666666666665,
          "unparseable_rate": 0.3333333333333333
        },
        "cultural": {
          "n": 1500,
          "n_confident_assertions": 1000,
          "n_hedged": 0,
          "n_unparseable": 500,
          "hallucination_rate": 0.077,
          "accuracy_among_assertions": 0.895,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.3333333333333333
        },
        "sound": {
          "n": 1500,
          "n_confident_assertions": 1000,
          "n_hedged": 0,
          "n_unparseable": 500,
          "hallucination_rate": 0.293,
          "accuracy_among_assertions": 0.679,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.3333333333333333
        }
      },
      "by_category": {
        "agency-firm": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.5733333333333334,
          "accuracy_among_scored": 0.5733333333333334,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.3111111111111111,
          "hedge_rate": 0.3333333333333333,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.04,
          "confidence_brier_score": 0.13291515151515151,
          "counts": {
            "correct": 86,
            "false_negative": 0,
            "false_positive": 14,
            "hedge": 50,
            "unparseable": 0,
            "hallucinated": 6,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.14,
            "domain": 0.09375,
            "handle": 0.0,
            "cultural": 0.08,
            "sound": 0.46
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 50,
              "n_unparseable": 0,
              "hallucination_rate": 0.14,
              "accuracy_among_assertions": 0.86,
              "hedge_rate": 0.3333333333333333,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 64,
              "n_hedged": 36,
              "n_unparseable": 50,
              "hallucination_rate": 0.09375,
              "accuracy_among_assertions": 0.90625,
              "hedge_rate": 0.24,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 5,
              "n_hedged": 95,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.6333333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.08,
              "accuracy_among_assertions": 0.78,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.46,
              "accuracy_among_assertions": 0.4,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "ai-agent": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6933333333333334,
          "accuracy_among_scored": 0.6933333333333334,
          "false_negative_rate": 0.00980392156862745,
          "false_positive_rate": 0.3958333333333333,
          "hedge_rate": 0.17333333333333334,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.16,
          "confidence_brier_score": 0.16031910112359551,
          "counts": {
            "correct": 104,
            "false_negative": 1,
            "false_positive": 19,
            "hedge": 26,
            "unparseable": 0,
            "hallucinated": 24,
            "excluded": 0,
            "n_risky_truth": 102,
            "n_safe_truth": 48
          },
          "surface_hallucination_rate": {
            "trademark": 0.16129032258064516,
            "domain": 0.17567567567567569,
            "handle": 0.0,
            "cultural": 0.07,
            "sound": 0.33
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 124,
              "n_hedged": 26,
              "n_unparseable": 0,
              "hallucination_rate": 0.16129032258064516,
              "accuracy_among_assertions": 0.8387096774193549,
              "hedge_rate": 0.17333333333333334,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 74,
              "n_hedged": 26,
              "n_unparseable": 50,
              "hallucination_rate": 0.17567567567567569,
              "accuracy_among_assertions": 0.8243243243243243,
              "hedge_rate": 0.17333333333333334,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 34,
              "n_hedged": 66,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.44,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.07,
              "accuracy_among_assertions": 0.93,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.33,
              "accuracy_among_assertions": 0.67,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "biotech-pharma": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7133333333333334,
          "accuracy_among_scored": 0.7133333333333334,
          "false_negative_rate": 0.01904761904761905,
          "false_positive_rate": 0.35555555555555557,
          "hedge_rate": 0.16666666666666666,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.14666666666666667,
          "confidence_brier_score": 0.13592448979591837,
          "counts": {
            "correct": 107,
            "false_negative": 2,
            "false_positive": 16,
            "hedge": 25,
            "unparseable": 0,
            "hallucinated": 22,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.144,
            "domain": 0.125,
            "handle": 0.0,
            "cultural": 0.05,
            "sound": 0.24
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 125,
              "n_hedged": 25,
              "n_unparseable": 0,
              "hallucination_rate": 0.144,
              "accuracy_among_assertions": 0.856,
              "hedge_rate": 0.16666666666666666,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 72,
              "n_hedged": 28,
              "n_unparseable": 50,
              "hallucination_rate": 0.125,
              "accuracy_among_assertions": 0.875,
              "hedge_rate": 0.18666666666666668,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 4,
              "n_hedged": 96,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.64,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.05,
              "accuracy_among_assertions": 0.95,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.24,
              "accuracy_among_assertions": 0.76,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "dev-tools": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6066666666666667,
          "accuracy_among_scored": 0.6066666666666667,
          "false_negative_rate": 0.047619047619047616,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.26,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.08666666666666667,
          "confidence_brier_score": 0.24520999999999998,
          "counts": {
            "correct": 91,
            "false_negative": 5,
            "false_positive": 15,
            "hedge": 39,
            "unparseable": 0,
            "hallucinated": 13,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.18018018018018017,
            "domain": 0.05714285714285714,
            "handle": 0.0,
            "cultural": 0.07,
            "sound": 0.21
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 111,
              "n_hedged": 39,
              "n_unparseable": 0,
              "hallucination_rate": 0.18018018018018017,
              "accuracy_among_assertions": 0.8198198198198198,
              "hedge_rate": 0.26,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 70,
              "n_hedged": 30,
              "n_unparseable": 50,
              "hallucination_rate": 0.05714285714285714,
              "accuracy_among_assertions": 0.9428571428571428,
              "hedge_rate": 0.2,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 47,
              "n_hedged": 53,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.35333333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.07,
              "accuracy_among_assertions": 0.93,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.21,
              "accuracy_among_assertions": 0.79,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "dtc-consumer": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6666666666666666,
          "accuracy_among_scored": 0.6666666666666666,
          "false_negative_rate": 0.0380952380952381,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.20666666666666667,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.10666666666666667,
          "confidence_brier_score": 0.22425393258426968,
          "counts": {
            "correct": 100,
            "false_negative": 4,
            "false_positive": 15,
            "hedge": 31,
            "unparseable": 0,
            "hallucinated": 16,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.15966386554621848,
            "domain": 0.17142857142857143,
            "handle": 0.0,
            "cultural": 0.1,
            "sound": 0.19
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 119,
              "n_hedged": 31,
              "n_unparseable": 0,
              "hallucination_rate": 0.15966386554621848,
              "accuracy_among_assertions": 0.8403361344537815,
              "hedge_rate": 0.20666666666666667,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 70,
              "n_hedged": 30,
              "n_unparseable": 50,
              "hallucination_rate": 0.17142857142857143,
              "accuracy_among_assertions": 0.8285714285714286,
              "hedge_rate": 0.2,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 13,
              "n_hedged": 87,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.58,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.1,
              "accuracy_among_assertions": 0.86,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.19,
              "accuracy_among_assertions": 0.77,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "fintech": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7533333333333333,
          "accuracy_among_scored": 0.7533333333333333,
          "false_negative_rate": 0.009523809523809525,
          "false_positive_rate": 0.28888888888888886,
          "hedge_rate": 0.14666666666666667,
          "unparseable_rate": 0.006666666666666667,
          "hallucination_rate": 0.06666666666666667,
          "confidence_brier_score": 0.17664368932038835,
          "counts": {
            "correct": 113,
            "false_negative": 1,
            "false_positive": 13,
            "hedge": 22,
            "unparseable": 1,
            "hallucinated": 10,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.11023622047244094,
            "domain": 0.07042253521126761,
            "handle": 0.0,
            "cultural": 0.06,
            "sound": 0.23
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 127,
              "n_hedged": 22,
              "n_unparseable": 1,
              "hallucination_rate": 0.11023622047244094,
              "accuracy_among_assertions": 0.889763779527559,
              "hedge_rate": 0.14666666666666667,
              "unparseable_rate": 0.006666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 71,
              "n_hedged": 29,
              "n_unparseable": 50,
              "hallucination_rate": 0.07042253521126761,
              "accuracy_among_assertions": 0.9295774647887324,
              "hedge_rate": 0.19333333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 30,
              "n_hedged": 70,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.4666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.06,
              "accuracy_among_assertions": 0.94,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.23,
              "accuracy_among_assertions": 0.77,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "gaming": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6666666666666666,
          "accuracy_among_scored": 0.6666666666666666,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.23333333333333334,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.02,
          "confidence_brier_score": 0.12998536585365852,
          "counts": {
            "correct": 100,
            "false_negative": 0,
            "false_positive": 15,
            "hedge": 35,
            "unparseable": 0,
            "hallucinated": 3,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.13043478260869565,
            "domain": 0.18571428571428572,
            "handle": 0.0,
            "cultural": 0.13,
            "sound": 0.35
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 115,
              "n_hedged": 35,
              "n_unparseable": 0,
              "hallucination_rate": 0.13043478260869565,
              "accuracy_among_assertions": 0.8695652173913043,
              "hedge_rate": 0.23333333333333334,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 70,
              "n_hedged": 30,
              "n_unparseable": 50,
              "hallucination_rate": 0.18571428571428572,
              "accuracy_among_assertions": 0.8142857142857143,
              "hedge_rate": 0.2,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 24,
              "n_hedged": 76,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.5066666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.13,
              "accuracy_among_assertions": 0.87,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.35,
              "accuracy_among_assertions": 0.65,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "indie-maker": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6,
          "accuracy_among_scored": 0.6,
          "false_negative_rate": 0.009523809523809525,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.29333333333333333,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.03333333333333333,
          "confidence_brier_score": 0.18068243243243243,
          "counts": {
            "correct": 90,
            "false_negative": 1,
            "false_positive": 15,
            "hedge": 44,
            "unparseable": 0,
            "hallucinated": 5,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.1509433962264151,
            "domain": 0.03125,
            "handle": 0.0,
            "cultural": 0.04,
            "sound": 0.36
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 106,
              "n_hedged": 44,
              "n_unparseable": 0,
              "hallucination_rate": 0.1509433962264151,
              "accuracy_among_assertions": 0.8490566037735849,
              "hedge_rate": 0.29333333333333333,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 64,
              "n_hedged": 36,
              "n_unparseable": 50,
              "hallucination_rate": 0.03125,
              "accuracy_among_assertions": 0.96875,
              "hedge_rate": 0.24,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 39,
              "n_hedged": 61,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.4066666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.04,
              "accuracy_among_assertions": 0.96,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.36,
              "accuracy_among_assertions": 0.64,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "restaurant-food": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6133333333333333,
          "accuracy_among_scored": 0.6133333333333333,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.2866666666666667,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.0,
          "confidence_brier_score": 0.09634102564102565,
          "counts": {
            "correct": 92,
            "false_negative": 0,
            "false_positive": 15,
            "hedge": 43,
            "unparseable": 0,
            "hallucinated": 0,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.14018691588785046,
            "domain": 0.13432835820895522,
            "handle": 0.0,
            "cultural": 0.14,
            "sound": 0.31
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 107,
              "n_hedged": 43,
              "n_unparseable": 0,
              "hallucination_rate": 0.14018691588785046,
              "accuracy_among_assertions": 0.8598130841121495,
              "hedge_rate": 0.2866666666666667,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 67,
              "n_hedged": 33,
              "n_unparseable": 50,
              "hallucination_rate": 0.13432835820895522,
              "accuracy_among_assertions": 0.8656716417910447,
              "hedge_rate": 0.22,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 14,
              "n_hedged": 86,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.5733333333333334,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.14,
              "accuracy_among_assertions": 0.76,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.31,
              "accuracy_among_assertions": 0.59,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "saas-b2b": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7133333333333334,
          "accuracy_among_scored": 0.7133333333333334,
          "false_negative_rate": 0.01904761904761905,
          "false_positive_rate": 0.35555555555555557,
          "hedge_rate": 0.16666666666666666,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.12,
          "confidence_brier_score": 0.13663297872340427,
          "counts": {
            "correct": 107,
            "false_negative": 2,
            "false_positive": 16,
            "hedge": 25,
            "unparseable": 0,
            "hallucinated": 18,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.144,
            "domain": 0.16666666666666666,
            "handle": 0.0,
            "cultural": 0.03,
            "sound": 0.25
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 125,
              "n_hedged": 25,
              "n_unparseable": 0,
              "hallucination_rate": 0.144,
              "accuracy_among_assertions": 0.856,
              "hedge_rate": 0.16666666666666666,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 72,
              "n_hedged": 28,
              "n_unparseable": 50,
              "hallucination_rate": 0.16666666666666666,
              "accuracy_among_assertions": 0.8333333333333334,
              "hedge_rate": 0.18666666666666668,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 42,
              "n_hedged": 58,
              "n_unparseable": 50,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.38666666666666666,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.03,
              "accuracy_among_assertions": 0.97,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        }
      },
      "by_difficulty": {
        "easy": {
          "n": 900,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 900,
          "accuracy": 0.5533333333333333,
          "accuracy_among_scored": 0.5533333333333333,
          "false_negative_rate": 0.015555555555555555,
          "false_positive_rate": 0.33555555555555555,
          "hedge_rate": 0.27,
          "unparseable_rate": 0.0011111111111111111,
          "hallucination_rate": 0.05444444444444444,
          "confidence_brier_score": 0.18368719512195122,
          "counts": {
            "correct": 498,
            "false_negative": 7,
            "false_positive": 151,
            "hedge": 243,
            "unparseable": 1,
            "hallucinated": 49,
            "excluded": 0,
            "n_risky_truth": 450,
            "n_safe_truth": 450
          },
          "surface_hallucination_rate": {
            "trademark": 0.24085365853658536,
            "domain": 0.02654867256637168,
            "handle": 0.0,
            "cultural": 0.05,
            "sound": 0.34
          },
          "surface_detail": {
            "trademark": {
              "n": 900,
              "n_confident_assertions": 656,
              "n_hedged": 243,
              "n_unparseable": 1,
              "hallucination_rate": 0.24085365853658536,
              "accuracy_among_assertions": 0.7591463414634146,
              "hedge_rate": 0.27,
              "unparseable_rate": 0.0011111111111111111
            },
            "domain": {
              "n": 900,
              "n_confident_assertions": 339,
              "n_hedged": 261,
              "n_unparseable": 300,
              "hallucination_rate": 0.02654867256637168,
              "accuracy_among_assertions": 0.9734513274336283,
              "hedge_rate": 0.29,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 900,
              "n_confident_assertions": 138,
              "n_hedged": 462,
              "n_unparseable": 300,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.5133333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 900,
              "n_confident_assertions": 600,
              "n_hedged": 0,
              "n_unparseable": 300,
              "hallucination_rate": 0.05,
              "accuracy_among_assertions": 0.9066666666666666,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 900,
              "n_confident_assertions": 600,
              "n_hedged": 0,
              "n_unparseable": 300,
              "hallucination_rate": 0.34,
              "accuracy_among_assertions": 0.6166666666666667,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "hard": {
          "n": 510,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 510,
          "accuracy": 0.8372549019607843,
          "accuracy_among_scored": 0.8372549019607843,
          "false_negative_rate": 0.01568627450980392,
          "false_positive_rate": null,
          "hedge_rate": 0.14705882352941177,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.1,
          "confidence_brier_score": 0.1392782747603834,
          "counts": {
            "correct": 427,
            "false_negative": 8,
            "false_positive": 0,
            "hedge": 75,
            "unparseable": 0,
            "hallucinated": 51,
            "excluded": 0,
            "n_risky_truth": 510,
            "n_safe_truth": 0
          },
          "surface_hallucination_rate": {
            "trademark": 0.01839080459770115,
            "domain": 0.16722408026755853,
            "handle": 0.0,
            "cultural": 0.1323529411764706,
            "sound": 0.21764705882352942
          },
          "surface_detail": {
            "trademark": {
              "n": 510,
              "n_confident_assertions": 435,
              "n_hedged": 75,
              "n_unparseable": 0,
              "hallucination_rate": 0.01839080459770115,
              "accuracy_among_assertions": 0.9816091954022989,
              "hedge_rate": 0.14705882352941177,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 510,
              "n_confident_assertions": 299,
              "n_hedged": 41,
              "n_unparseable": 170,
              "hallucination_rate": 0.16722408026755853,
              "accuracy_among_assertions": 0.8327759197324415,
              "hedge_rate": 0.0803921568627451,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 510,
              "n_confident_assertions": 103,
              "n_hedged": 237,
              "n_unparseable": 170,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.4647058823529412,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 510,
              "n_confident_assertions": 340,
              "n_hedged": 0,
              "n_unparseable": 170,
              "hallucination_rate": 0.1323529411764706,
              "accuracy_among_assertions": 0.861764705882353,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 510,
              "n_confident_assertions": 340,
              "n_hedged": 0,
              "n_unparseable": 170,
              "hallucination_rate": 0.21764705882352942,
              "accuracy_among_assertions": 0.7764705882352941,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "medium": {
          "n": 90,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 90,
          "accuracy": 0.7222222222222222,
          "accuracy_among_scored": 0.7222222222222222,
          "false_negative_rate": 0.011494252873563218,
          "false_positive_rate": 0.6666666666666666,
          "hedge_rate": 0.24444444444444444,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.18888888888888888,
          "confidence_brier_score": 0.09479791666666669,
          "counts": {
            "correct": 65,
            "false_negative": 1,
            "false_positive": 2,
            "hedge": 22,
            "unparseable": 0,
            "hallucinated": 17,
            "excluded": 0,
            "n_risky_truth": 87,
            "n_safe_truth": 3
          },
          "surface_hallucination_rate": {
            "trademark": 0.04411764705882353,
            "domain": 0.4642857142857143,
            "handle": 0.0,
            "cultural": 0.03333333333333333,
            "sound": 0.25
          },
          "surface_detail": {
            "trademark": {
              "n": 90,
              "n_confident_assertions": 68,
              "n_hedged": 22,
              "n_unparseable": 0,
              "hallucination_rate": 0.04411764705882353,
              "accuracy_among_assertions": 0.9558823529411765,
              "hedge_rate": 0.24444444444444444,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 90,
              "n_confident_assertions": 56,
              "n_hedged": 4,
              "n_unparseable": 30,
              "hallucination_rate": 0.4642857142857143,
              "accuracy_among_assertions": 0.5357142857142857,
              "hedge_rate": 0.044444444444444446,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 90,
              "n_confident_assertions": 11,
              "n_hedged": 49,
              "n_unparseable": 30,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.5444444444444444,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 90,
              "n_confident_assertions": 60,
              "n_hedged": 0,
              "n_unparseable": 30,
              "hallucination_rate": 0.03333333333333333,
              "accuracy_among_assertions": 0.9666666666666667,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 90,
              "n_confident_assertions": 60,
              "n_hedged": 0,
              "n_unparseable": 30,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        }
      },
      "by_prompt_version": {
        "v1_naive": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.698,
          "accuracy_among_scored": 0.698,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.9801324503311258,
          "hedge_rate": 0.004,
          "unparseable_rate": 0.002,
          "hallucination_rate": 0.03,
          "confidence_brier_score": 0.49133874345549733,
          "counts": {
            "correct": 349,
            "false_negative": 0,
            "false_positive": 148,
            "hedge": 2,
            "unparseable": 1,
            "hallucinated": 15,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.2977867203219316,
            "domain": null,
            "handle": null,
            "cultural": null,
            "sound": null
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 497,
              "n_hedged": 2,
              "n_unparseable": 1,
              "hallucination_rate": 0.2977867203219316,
              "accuracy_among_assertions": 0.7022132796780685,
              "hedge_rate": 0.004,
              "unparseable_rate": 0.002
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            }
          }
        },
        "v2_constrained": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.572,
          "accuracy_among_scored": 0.572,
          "false_negative_rate": 0.017191977077363897,
          "false_positive_rate": 0.0,
          "hedge_rate": 0.416,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.028,
          "confidence_brier_score": 0.056516438356164385,
          "counts": {
            "correct": 286,
            "false_negative": 6,
            "false_positive": 0,
            "hedge": 208,
            "unparseable": 0,
            "hallucinated": 14,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.02054794520547945,
            "domain": 0.10429447852760736,
            "handle": 0.0,
            "cultural": 0.08,
            "sound": 0.304
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 292,
              "n_hedged": 208,
              "n_unparseable": 0,
              "hallucination_rate": 0.02054794520547945,
              "accuracy_among_assertions": 0.9794520547945206,
              "hedge_rate": 0.416,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 326,
              "n_hedged": 174,
              "n_unparseable": 0,
              "hallucination_rate": 0.10429447852760736,
              "accuracy_among_assertions": 0.8957055214723927,
              "hedge_rate": 0.348,
              "unparseable_rate": 0.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 138,
              "n_hedged": 362,
              "n_unparseable": 0,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.724,
              "unparseable_rate": 0.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.08,
              "accuracy_among_assertions": 0.892,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.304,
              "accuracy_among_assertions": 0.668,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            }
          }
        },
        "v3_grounded": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.71,
          "accuracy_among_scored": 0.71,
          "false_negative_rate": 0.02865329512893983,
          "false_positive_rate": 0.033112582781456956,
          "hedge_rate": 0.26,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.176,
          "confidence_brier_score": 0.07613513513513515,
          "counts": {
            "correct": 355,
            "false_negative": 10,
            "false_positive": 5,
            "hedge": 130,
            "unparseable": 0,
            "hallucinated": 88,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.04054054054054054,
            "domain": 0.13858695652173914,
            "handle": 0.0,
            "cultural": 0.074,
            "sound": 0.282
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 370,
              "n_hedged": 130,
              "n_unparseable": 0,
              "hallucination_rate": 0.04054054054054054,
              "accuracy_among_assertions": 0.9594594594594594,
              "hedge_rate": 0.26,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 368,
              "n_hedged": 132,
              "n_unparseable": 0,
              "hallucination_rate": 0.13858695652173914,
              "accuracy_among_assertions": 0.8614130434782609,
              "hedge_rate": 0.264,
              "unparseable_rate": 0.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 114,
              "n_hedged": 386,
              "n_unparseable": 0,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 1.0,
              "hedge_rate": 0.772,
              "unparseable_rate": 0.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.074,
              "accuracy_among_assertions": 0.898,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.282,
              "accuracy_among_assertions": 0.69,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            }
          }
        }
      }
    },
    "gemini-2.5-pro": {
      "n": 1500,
      "n_excluded_max_tokens_truncated": 0,
      "n_scored": 1500,
      "accuracy": 0.7433333333333333,
      "accuracy_among_scored": 0.7433333333333333,
      "false_negative_rate": 0.0659025787965616,
      "false_positive_rate": 0.47019867549668876,
      "hedge_rate": 0.017333333333333333,
      "unparseable_rate": 0.051333333333333335,
      "hallucination_rate": 0.432,
      "confidence_brier_score": 0.2753455762081784,
      "counts": {
        "correct": 1115,
        "false_negative": 69,
        "false_positive": 213,
        "hedge": 26,
        "unparseable": 77,
        "hallucinated": 648,
        "excluded": 0,
        "n_risky_truth": 1047,
        "n_safe_truth": 453
      },
      "surface_hallucination_rate": {
        "trademark": 0.20186113099498926,
        "domain": 0.21568627450980393,
        "handle": 0.09622641509433963,
        "cultural": 0.06616862326574173,
        "sound": 0.27801724137931033
      },
      "surface_detail": {
        "trademark": {
          "n": 1500,
          "n_confident_assertions": 1397,
          "n_hedged": 26,
          "n_unparseable": 77,
          "hallucination_rate": 0.20186113099498926,
          "accuracy_among_assertions": 0.7981388690050107,
          "hedge_rate": 0.017333333333333333,
          "unparseable_rate": 0.051333333333333335
        },
        "domain": {
          "n": 1500,
          "n_confident_assertions": 918,
          "n_hedged": 19,
          "n_unparseable": 563,
          "hallucination_rate": 0.21568627450980393,
          "accuracy_among_assertions": 0.7843137254901961,
          "hedge_rate": 0.012666666666666666,
          "unparseable_rate": 0.37533333333333335
        },
        "handle": {
          "n": 1500,
          "n_confident_assertions": 530,
          "n_hedged": 407,
          "n_unparseable": 563,
          "hallucination_rate": 0.09622641509433963,
          "accuracy_among_assertions": 0.8528301886792453,
          "hedge_rate": 0.2713333333333333,
          "unparseable_rate": 0.37533333333333335
        },
        "cultural": {
          "n": 1500,
          "n_confident_assertions": 937,
          "n_hedged": 0,
          "n_unparseable": 563,
          "hallucination_rate": 0.06616862326574173,
          "accuracy_among_assertions": 0.9050160085378869,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.37533333333333335
        },
        "sound": {
          "n": 1500,
          "n_confident_assertions": 928,
          "n_hedged": 9,
          "n_unparseable": 563,
          "hallucination_rate": 0.27801724137931033,
          "accuracy_among_assertions": 0.6928879310344828,
          "hedge_rate": 0.006,
          "unparseable_rate": 0.37533333333333335
        }
      },
      "by_category": {
        "agency-firm": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7666666666666667,
          "accuracy_among_scored": 0.7666666666666667,
          "false_negative_rate": 0.06666666666666667,
          "false_positive_rate": 0.4222222222222222,
          "hedge_rate": 0.006666666666666667,
          "unparseable_rate": 0.05333333333333334,
          "hallucination_rate": 0.3933333333333333,
          "confidence_brier_score": 0.273168345323741,
          "counts": {
            "correct": 115,
            "false_negative": 7,
            "false_positive": 19,
            "hedge": 1,
            "unparseable": 8,
            "hallucinated": 59,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.18439716312056736,
            "domain": 0.25,
            "handle": 0.05,
            "cultural": 0.0425531914893617,
            "sound": 0.3804347826086957
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 141,
              "n_hedged": 1,
              "n_unparseable": 8,
              "hallucination_rate": 0.18439716312056736,
              "accuracy_among_assertions": 0.8156028368794326,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.05333333333333334
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 92,
              "n_hedged": 2,
              "n_unparseable": 56,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.37333333333333335
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 60,
              "n_hedged": 34,
              "n_unparseable": 56,
              "hallucination_rate": 0.05,
              "accuracy_among_assertions": 0.7333333333333333,
              "hedge_rate": 0.22666666666666666,
              "unparseable_rate": 0.37333333333333335
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 94,
              "n_hedged": 0,
              "n_unparseable": 56,
              "hallucination_rate": 0.0425531914893617,
              "accuracy_among_assertions": 0.8191489361702128,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.37333333333333335
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 92,
              "n_hedged": 2,
              "n_unparseable": 56,
              "hallucination_rate": 0.3804347826086957,
              "accuracy_among_assertions": 0.4782608695652174,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.37333333333333335
            }
          }
        },
        "ai-agent": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7133333333333334,
          "accuracy_among_scored": 0.7133333333333334,
          "false_negative_rate": 0.06862745098039216,
          "false_positive_rate": 0.5625,
          "hedge_rate": 0.013333333333333334,
          "unparseable_rate": 0.04666666666666667,
          "hallucination_rate": 0.4666666666666667,
          "confidence_brier_score": 0.3041768656716418,
          "counts": {
            "correct": 107,
            "false_negative": 7,
            "false_positive": 27,
            "hedge": 2,
            "unparseable": 7,
            "hallucinated": 70,
            "excluded": 0,
            "n_risky_truth": 102,
            "n_safe_truth": 48
          },
          "surface_hallucination_rate": {
            "trademark": 0.24113475177304963,
            "domain": 0.24731182795698925,
            "handle": 0.0625,
            "cultural": 0.0425531914893617,
            "sound": 0.2857142857142857
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 141,
              "n_hedged": 2,
              "n_unparseable": 7,
              "hallucination_rate": 0.24113475177304963,
              "accuracy_among_assertions": 0.7588652482269503,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.04666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 93,
              "n_hedged": 1,
              "n_unparseable": 56,
              "hallucination_rate": 0.24731182795698925,
              "accuracy_among_assertions": 0.7526881720430108,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.37333333333333335
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 48,
              "n_hedged": 46,
              "n_unparseable": 56,
              "hallucination_rate": 0.0625,
              "accuracy_among_assertions": 0.9375,
              "hedge_rate": 0.30666666666666664,
              "unparseable_rate": 0.37333333333333335
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 94,
              "n_hedged": 0,
              "n_unparseable": 56,
              "hallucination_rate": 0.0425531914893617,
              "accuracy_among_assertions": 0.9574468085106383,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.37333333333333335
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 91,
              "n_hedged": 3,
              "n_unparseable": 56,
              "hallucination_rate": 0.2857142857142857,
              "accuracy_among_assertions": 0.7142857142857143,
              "hedge_rate": 0.02,
              "unparseable_rate": 0.37333333333333335
            }
          }
        },
        "biotech-pharma": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7666666666666667,
          "accuracy_among_scored": 0.7666666666666667,
          "false_negative_rate": 0.0761904761904762,
          "false_positive_rate": 0.4666666666666667,
          "hedge_rate": 0.006666666666666667,
          "unparseable_rate": 0.03333333333333333,
          "hallucination_rate": 0.5133333333333333,
          "confidence_brier_score": 0.27716760563380277,
          "counts": {
            "correct": 115,
            "false_negative": 8,
            "false_positive": 21,
            "hedge": 1,
            "unparseable": 5,
            "hallucinated": 77,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2013888888888889,
            "domain": 0.23404255319148937,
            "handle": 0.14,
            "cultural": 0.020833333333333332,
            "sound": 0.25
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 144,
              "n_hedged": 1,
              "n_unparseable": 5,
              "hallucination_rate": 0.2013888888888889,
              "accuracy_among_assertions": 0.7986111111111112,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.03333333333333333
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 94,
              "n_hedged": 2,
              "n_unparseable": 54,
              "hallucination_rate": 0.23404255319148937,
              "accuracy_among_assertions": 0.7659574468085106,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.36
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 50,
              "n_hedged": 46,
              "n_unparseable": 54,
              "hallucination_rate": 0.14,
              "accuracy_among_assertions": 0.86,
              "hedge_rate": 0.30666666666666664,
              "unparseable_rate": 0.36
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 96,
              "n_hedged": 0,
              "n_unparseable": 54,
              "hallucination_rate": 0.020833333333333332,
              "accuracy_among_assertions": 0.9791666666666666,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.36
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 96,
              "n_hedged": 0,
              "n_unparseable": 54,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.36
            }
          }
        },
        "dev-tools": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7,
          "accuracy_among_scored": 0.7,
          "false_negative_rate": 0.08571428571428572,
          "false_positive_rate": 0.4666666666666667,
          "hedge_rate": 0.03333333333333333,
          "unparseable_rate": 0.06666666666666667,
          "hallucination_rate": 0.36666666666666664,
          "confidence_brier_score": 0.30452999999999997,
          "counts": {
            "correct": 105,
            "false_negative": 9,
            "false_positive": 21,
            "hedge": 5,
            "unparseable": 10,
            "hallucinated": 55,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2222222222222222,
            "domain": 0.16853932584269662,
            "handle": 0.09090909090909091,
            "cultural": 0.12087912087912088,
            "sound": 0.27472527472527475
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 135,
              "n_hedged": 5,
              "n_unparseable": 10,
              "hallucination_rate": 0.2222222222222222,
              "accuracy_among_assertions": 0.7777777777777778,
              "hedge_rate": 0.03333333333333333,
              "unparseable_rate": 0.06666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 89,
              "n_hedged": 2,
              "n_unparseable": 59,
              "hallucination_rate": 0.16853932584269662,
              "accuracy_among_assertions": 0.8314606741573034,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.3933333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 55,
              "n_hedged": 36,
              "n_unparseable": 59,
              "hallucination_rate": 0.09090909090909091,
              "accuracy_among_assertions": 0.9090909090909091,
              "hedge_rate": 0.24,
              "unparseable_rate": 0.3933333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 91,
              "n_hedged": 0,
              "n_unparseable": 59,
              "hallucination_rate": 0.12087912087912088,
              "accuracy_among_assertions": 0.8791208791208791,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3933333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 91,
              "n_hedged": 0,
              "n_unparseable": 59,
              "hallucination_rate": 0.27472527472527475,
              "accuracy_among_assertions": 0.7252747252747253,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3933333333333333
            }
          }
        },
        "dtc-consumer": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.72,
          "accuracy_among_scored": 0.72,
          "false_negative_rate": 0.12380952380952381,
          "false_positive_rate": 0.5555555555555556,
          "hedge_rate": 0.006666666666666667,
          "unparseable_rate": 0.02,
          "hallucination_rate": 0.5,
          "confidence_brier_score": 0.3006836879432624,
          "counts": {
            "correct": 108,
            "false_negative": 13,
            "false_positive": 25,
            "hedge": 1,
            "unparseable": 3,
            "hallucinated": 75,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2602739726027397,
            "domain": 0.25252525252525254,
            "handle": 0.19230769230769232,
            "cultural": 0.06060606060606061,
            "sound": 0.18181818181818182
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 146,
              "n_hedged": 1,
              "n_unparseable": 3,
              "hallucination_rate": 0.2602739726027397,
              "accuracy_among_assertions": 0.7397260273972602,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.02
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 0,
              "n_unparseable": 51,
              "hallucination_rate": 0.25252525252525254,
              "accuracy_among_assertions": 0.7474747474747475,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.34
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 52,
              "n_hedged": 47,
              "n_unparseable": 51,
              "hallucination_rate": 0.19230769230769232,
              "accuracy_among_assertions": 0.7307692307692307,
              "hedge_rate": 0.31333333333333335,
              "unparseable_rate": 0.34
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 0,
              "n_unparseable": 51,
              "hallucination_rate": 0.06060606060606061,
              "accuracy_among_assertions": 0.898989898989899,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.34
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 0,
              "n_unparseable": 51,
              "hallucination_rate": 0.18181818181818182,
              "accuracy_among_assertions": 0.7777777777777778,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.34
            }
          }
        },
        "fintech": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.8133333333333334,
          "accuracy_among_scored": 0.8133333333333334,
          "false_negative_rate": 0.009523809523809525,
          "false_positive_rate": 0.37777777777777777,
          "hedge_rate": 0.02,
          "unparseable_rate": 0.04666666666666667,
          "hallucination_rate": 0.38,
          "confidence_brier_score": 0.20672255639097745,
          "counts": {
            "correct": 122,
            "false_negative": 1,
            "false_positive": 17,
            "hedge": 3,
            "unparseable": 7,
            "hallucinated": 57,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.12857142857142856,
            "domain": 0.13978494623655913,
            "handle": 0.09615384615384616,
            "cultural": 0.08333333333333333,
            "sound": 0.2
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 140,
              "n_hedged": 3,
              "n_unparseable": 7,
              "hallucination_rate": 0.12857142857142856,
              "accuracy_among_assertions": 0.8714285714285714,
              "hedge_rate": 0.02,
              "unparseable_rate": 0.04666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 93,
              "n_hedged": 3,
              "n_unparseable": 54,
              "hallucination_rate": 0.13978494623655913,
              "accuracy_among_assertions": 0.8602150537634409,
              "hedge_rate": 0.02,
              "unparseable_rate": 0.36
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 52,
              "n_hedged": 44,
              "n_unparseable": 54,
              "hallucination_rate": 0.09615384615384616,
              "accuracy_among_assertions": 0.9038461538461539,
              "hedge_rate": 0.29333333333333333,
              "unparseable_rate": 0.36
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 96,
              "n_hedged": 0,
              "n_unparseable": 54,
              "hallucination_rate": 0.08333333333333333,
              "accuracy_among_assertions": 0.9166666666666666,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.36
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 95,
              "n_hedged": 1,
              "n_unparseable": 54,
              "hallucination_rate": 0.2,
              "accuracy_among_assertions": 0.8,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.36
            }
          }
        },
        "gaming": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.7733333333333333,
          "accuracy_among_scored": 0.7733333333333333,
          "false_negative_rate": 0.0380952380952381,
          "false_positive_rate": 0.4,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.08,
          "hallucination_rate": 0.3933333333333333,
          "confidence_brier_score": 0.23725193798449612,
          "counts": {
            "correct": 116,
            "false_negative": 4,
            "false_positive": 18,
            "hedge": 0,
            "unparseable": 12,
            "hallucinated": 59,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.15942028985507245,
            "domain": 0.26666666666666666,
            "handle": 0.09090909090909091,
            "cultural": 0.07692307692307693,
            "sound": 0.2696629213483146
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 138,
              "n_hedged": 0,
              "n_unparseable": 12,
              "hallucination_rate": 0.15942028985507245,
              "accuracy_among_assertions": 0.8405797101449275,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.08
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 90,
              "n_hedged": 1,
              "n_unparseable": 59,
              "hallucination_rate": 0.26666666666666666,
              "accuracy_among_assertions": 0.7333333333333333,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.3933333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 55,
              "n_hedged": 36,
              "n_unparseable": 59,
              "hallucination_rate": 0.09090909090909091,
              "accuracy_among_assertions": 0.9090909090909091,
              "hedge_rate": 0.24,
              "unparseable_rate": 0.3933333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 91,
              "n_hedged": 0,
              "n_unparseable": 59,
              "hallucination_rate": 0.07692307692307693,
              "accuracy_among_assertions": 0.9230769230769231,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3933333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 89,
              "n_hedged": 2,
              "n_unparseable": 59,
              "hallucination_rate": 0.2696629213483146,
              "accuracy_among_assertions": 0.7303370786516854,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.3933333333333333
            }
          }
        },
        "indie-maker": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.68,
          "accuracy_among_scored": 0.68,
          "false_negative_rate": 0.05714285714285714,
          "false_positive_rate": 0.4888888888888889,
          "hedge_rate": 0.05333333333333334,
          "unparseable_rate": 0.08,
          "hallucination_rate": 0.46,
          "confidence_brier_score": 0.3179024193548387,
          "counts": {
            "correct": 102,
            "false_negative": 6,
            "false_positive": 22,
            "hedge": 8,
            "unparseable": 12,
            "hallucinated": 69,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2153846153846154,
            "domain": 0.18072289156626506,
            "handle": 0.0851063829787234,
            "cultural": 0.045454545454545456,
            "sound": 0.3977272727272727
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 130,
              "n_hedged": 8,
              "n_unparseable": 12,
              "hallucination_rate": 0.2153846153846154,
              "accuracy_among_assertions": 0.7846153846153846,
              "hedge_rate": 0.05333333333333334,
              "unparseable_rate": 0.08
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 83,
              "n_hedged": 5,
              "n_unparseable": 62,
              "hallucination_rate": 0.18072289156626506,
              "accuracy_among_assertions": 0.8192771084337349,
              "hedge_rate": 0.03333333333333333,
              "unparseable_rate": 0.41333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 47,
              "n_hedged": 41,
              "n_unparseable": 62,
              "hallucination_rate": 0.0851063829787234,
              "accuracy_among_assertions": 0.9148936170212766,
              "hedge_rate": 0.2733333333333333,
              "unparseable_rate": 0.41333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 88,
              "n_hedged": 0,
              "n_unparseable": 62,
              "hallucination_rate": 0.045454545454545456,
              "accuracy_among_assertions": 0.9545454545454546,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.41333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 88,
              "n_hedged": 0,
              "n_unparseable": 62,
              "hallucination_rate": 0.3977272727272727,
              "accuracy_among_assertions": 0.6022727272727273,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.41333333333333333
            }
          }
        },
        "restaurant-food": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.76,
          "accuracy_among_scored": 0.76,
          "false_negative_rate": 0.08571428571428572,
          "false_positive_rate": 0.4,
          "hedge_rate": 0.006666666666666667,
          "unparseable_rate": 0.05333333333333334,
          "hallucination_rate": 0.4066666666666667,
          "confidence_brier_score": 0.2744970802919708,
          "counts": {
            "correct": 114,
            "false_negative": 9,
            "false_positive": 18,
            "hedge": 1,
            "unparseable": 8,
            "hallucinated": 61,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.19148936170212766,
            "domain": 0.17391304347826086,
            "handle": 0.05454545454545454,
            "cultural": 0.07526881720430108,
            "sound": 0.3333333333333333
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 141,
              "n_hedged": 1,
              "n_unparseable": 8,
              "hallucination_rate": 0.19148936170212766,
              "accuracy_among_assertions": 0.8085106382978723,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.05333333333333334
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 92,
              "n_hedged": 1,
              "n_unparseable": 57,
              "hallucination_rate": 0.17391304347826086,
              "accuracy_among_assertions": 0.8260869565217391,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.38
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 55,
              "n_hedged": 38,
              "n_unparseable": 57,
              "hallucination_rate": 0.05454545454545454,
              "accuracy_among_assertions": 0.7636363636363637,
              "hedge_rate": 0.25333333333333335,
              "unparseable_rate": 0.38
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 93,
              "n_hedged": 0,
              "n_unparseable": 57,
              "hallucination_rate": 0.07526881720430108,
              "accuracy_among_assertions": 0.8172043010752689,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.38
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 93,
              "n_hedged": 0,
              "n_unparseable": 57,
              "hallucination_rate": 0.3333333333333333,
              "accuracy_among_assertions": 0.5591397849462365,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.38
            }
          }
        },
        "saas-b2b": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.74,
          "accuracy_among_scored": 0.74,
          "false_negative_rate": 0.047619047619047616,
          "false_positive_rate": 0.5555555555555556,
          "hedge_rate": 0.02666666666666667,
          "unparseable_rate": 0.03333333333333333,
          "hallucination_rate": 0.44,
          "confidence_brier_score": 0.25838970588235294,
          "counts": {
            "correct": 111,
            "false_negative": 5,
            "false_positive": 25,
            "hedge": 4,
            "unparseable": 5,
            "hallucinated": 66,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2127659574468085,
            "domain": 0.23655913978494625,
            "handle": 0.10714285714285714,
            "cultural": 0.09473684210526316,
            "sound": 0.22340425531914893
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 141,
              "n_hedged": 4,
              "n_unparseable": 5,
              "hallucination_rate": 0.2127659574468085,
              "accuracy_among_assertions": 0.7872340425531915,
              "hedge_rate": 0.02666666666666667,
              "unparseable_rate": 0.03333333333333333
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 93,
              "n_hedged": 2,
              "n_unparseable": 55,
              "hallucination_rate": 0.23655913978494625,
              "accuracy_among_assertions": 0.7634408602150538,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.36666666666666664
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 56,
              "n_hedged": 39,
              "n_unparseable": 55,
              "hallucination_rate": 0.10714285714285714,
              "accuracy_among_assertions": 0.8928571428571429,
              "hedge_rate": 0.26,
              "unparseable_rate": 0.36666666666666664
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 95,
              "n_hedged": 0,
              "n_unparseable": 55,
              "hallucination_rate": 0.09473684210526316,
              "accuracy_among_assertions": 0.9052631578947369,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.36666666666666664
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 94,
              "n_hedged": 1,
              "n_unparseable": 55,
              "hallucination_rate": 0.22340425531914893,
              "accuracy_among_assertions": 0.776595744680851,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.36666666666666664
            }
          }
        }
      },
      "by_difficulty": {
        "easy": {
          "n": 900,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 900,
          "accuracy": 0.6766666666666666,
          "accuracy_among_scored": 0.6766666666666666,
          "false_negative_rate": 0.06666666666666667,
          "false_positive_rate": 0.4666666666666667,
          "hedge_rate": 0.021111111111111112,
          "unparseable_rate": 0.035555555555555556,
          "hallucination_rate": 0.39666666666666667,
          "confidence_brier_score": 0.23650464547677263,
          "counts": {
            "correct": 609,
            "false_negative": 30,
            "false_positive": 210,
            "hedge": 19,
            "unparseable": 32,
            "hallucinated": 357,
            "excluded": 0,
            "n_risky_truth": 450,
            "n_safe_truth": 450
          },
          "surface_hallucination_rate": {
            "trademark": 0.2826855123674912,
            "domain": 0.1746031746031746,
            "handle": 0.11834319526627218,
            "cultural": 0.05163511187607573,
            "sound": 0.32642487046632124
          },
          "surface_detail": {
            "trademark": {
              "n": 900,
              "n_confident_assertions": 849,
              "n_hedged": 19,
              "n_unparseable": 32,
              "hallucination_rate": 0.2826855123674912,
              "accuracy_among_assertions": 0.7173144876325088,
              "hedge_rate": 0.021111111111111112,
              "unparseable_rate": 0.035555555555555556
            },
            "domain": {
              "n": 900,
              "n_confident_assertions": 567,
              "n_hedged": 14,
              "n_unparseable": 319,
              "hallucination_rate": 0.1746031746031746,
              "accuracy_among_assertions": 0.8253968253968254,
              "hedge_rate": 0.015555555555555555,
              "unparseable_rate": 0.35444444444444445
            },
            "handle": {
              "n": 900,
              "n_confident_assertions": 338,
              "n_hedged": 243,
              "n_unparseable": 319,
              "hallucination_rate": 0.11834319526627218,
              "accuracy_among_assertions": 0.8076923076923077,
              "hedge_rate": 0.27,
              "unparseable_rate": 0.35444444444444445
            },
            "cultural": {
              "n": 900,
              "n_confident_assertions": 581,
              "n_hedged": 0,
              "n_unparseable": 319,
              "hallucination_rate": 0.05163511187607573,
              "accuracy_among_assertions": 0.9053356282271945,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.35444444444444445
            },
            "sound": {
              "n": 900,
              "n_confident_assertions": 579,
              "n_hedged": 2,
              "n_unparseable": 319,
              "hallucination_rate": 0.32642487046632124,
              "accuracy_among_assertions": 0.6303972366148531,
              "hedge_rate": 0.0022222222222222222,
              "unparseable_rate": 0.35444444444444445
            }
          }
        },
        "hard": {
          "n": 510,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 510,
          "accuracy": 0.8529411764705882,
          "accuracy_among_scored": 0.8529411764705882,
          "false_negative_rate": 0.06666666666666667,
          "false_positive_rate": null,
          "hedge_rate": 0.011764705882352941,
          "unparseable_rate": 0.06862745098039216,
          "hallucination_rate": 0.46862745098039216,
          "confidence_brier_score": 0.33305995575221237,
          "counts": {
            "correct": 435,
            "false_negative": 34,
            "false_positive": 0,
            "hedge": 6,
            "unparseable": 35,
            "hallucinated": 239,
            "excluded": 0,
            "n_risky_truth": 510,
            "n_safe_truth": 0
          },
          "surface_hallucination_rate": {
            "trademark": 0.07249466950959488,
            "domain": 0.2425249169435216,
            "handle": 0.054878048780487805,
            "cultural": 0.08823529411764706,
            "sound": 0.18666666666666668
          },
          "surface_detail": {
            "trademark": {
              "n": 510,
              "n_confident_assertions": 469,
              "n_hedged": 6,
              "n_unparseable": 35,
              "hallucination_rate": 0.07249466950959488,
              "accuracy_among_assertions": 0.9275053304904051,
              "hedge_rate": 0.011764705882352941,
              "unparseable_rate": 0.06862745098039216
            },
            "domain": {
              "n": 510,
              "n_confident_assertions": 301,
              "n_hedged": 5,
              "n_unparseable": 204,
              "hallucination_rate": 0.2425249169435216,
              "accuracy_among_assertions": 0.7574750830564784,
              "hedge_rate": 0.00980392156862745,
              "unparseable_rate": 0.4
            },
            "handle": {
              "n": 510,
              "n_confident_assertions": 164,
              "n_hedged": 142,
              "n_unparseable": 204,
              "hallucination_rate": 0.054878048780487805,
              "accuracy_among_assertions": 0.9329268292682927,
              "hedge_rate": 0.2784313725490196,
              "unparseable_rate": 0.4
            },
            "cultural": {
              "n": 510,
              "n_confident_assertions": 306,
              "n_hedged": 0,
              "n_unparseable": 204,
              "hallucination_rate": 0.08823529411764706,
              "accuracy_among_assertions": 0.9052287581699346,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.4
            },
            "sound": {
              "n": 510,
              "n_confident_assertions": 300,
              "n_hedged": 6,
              "n_unparseable": 204,
              "hallucination_rate": 0.18666666666666668,
              "accuracy_among_assertions": 0.8066666666666666,
              "hedge_rate": 0.011764705882352941,
              "unparseable_rate": 0.4
            }
          }
        },
        "medium": {
          "n": 90,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 90,
          "accuracy": 0.7888888888888889,
          "accuracy_among_scored": 0.7888888888888889,
          "false_negative_rate": 0.05747126436781609,
          "false_positive_rate": 1.0,
          "hedge_rate": 0.011111111111111112,
          "unparseable_rate": 0.1111111111111111,
          "hallucination_rate": 0.5777777777777777,
          "confidence_brier_score": 0.3511453333333333,
          "counts": {
            "correct": 71,
            "false_negative": 5,
            "false_positive": 3,
            "hedge": 1,
            "unparseable": 10,
            "hallucinated": 52,
            "excluded": 0,
            "n_risky_truth": 87,
            "n_safe_truth": 3
          },
          "surface_hallucination_rate": {
            "trademark": 0.10126582278481013,
            "domain": 0.52,
            "handle": 0.07142857142857142,
            "cultural": 0.1,
            "sound": 0.2653061224489796
          },
          "surface_detail": {
            "trademark": {
              "n": 90,
              "n_confident_assertions": 79,
              "n_hedged": 1,
              "n_unparseable": 10,
              "hallucination_rate": 0.10126582278481013,
              "accuracy_among_assertions": 0.8987341772151899,
              "hedge_rate": 0.011111111111111112,
              "unparseable_rate": 0.1111111111111111
            },
            "domain": {
              "n": 90,
              "n_confident_assertions": 50,
              "n_hedged": 0,
              "n_unparseable": 40,
              "hallucination_rate": 0.52,
              "accuracy_among_assertions": 0.48,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.4444444444444444
            },
            "handle": {
              "n": 90,
              "n_confident_assertions": 28,
              "n_hedged": 22,
              "n_unparseable": 40,
              "hallucination_rate": 0.07142857142857142,
              "accuracy_among_assertions": 0.9285714285714286,
              "hedge_rate": 0.24444444444444444,
              "unparseable_rate": 0.4444444444444444
            },
            "cultural": {
              "n": 90,
              "n_confident_assertions": 50,
              "n_hedged": 0,
              "n_unparseable": 40,
              "hallucination_rate": 0.1,
              "accuracy_among_assertions": 0.9,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.4444444444444444
            },
            "sound": {
              "n": 90,
              "n_confident_assertions": 49,
              "n_hedged": 1,
              "n_unparseable": 40,
              "hallucination_rate": 0.2653061224489796,
              "accuracy_among_assertions": 0.7346938775510204,
              "hedge_rate": 0.011111111111111112,
              "unparseable_rate": 0.4444444444444444
            }
          }
        }
      },
      "by_prompt_version": {
        "v1_naive": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.698,
          "accuracy_among_scored": 0.698,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.9072847682119205,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.028,
          "hallucination_rate": 0.222,
          "confidence_brier_score": 0.5335815668202765,
          "counts": {
            "correct": 349,
            "false_negative": 0,
            "false_positive": 137,
            "hedge": 0,
            "unparseable": 14,
            "hallucinated": 111,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.28189300411522633,
            "domain": null,
            "handle": null,
            "cultural": null,
            "sound": null
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 486,
              "n_hedged": 0,
              "n_unparseable": 14,
              "hallucination_rate": 0.28189300411522633,
              "accuracy_among_assertions": 0.7181069958847737,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.028
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            }
          }
        },
        "v2_constrained": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.764,
          "accuracy_among_scored": 0.764,
          "false_negative_rate": 0.07449856733524356,
          "false_positive_rate": 0.423841059602649,
          "hedge_rate": 0.002,
          "unparseable_rate": 0.054,
          "hallucination_rate": 0.722,
          "confidence_brier_score": 0.1689936440677966,
          "counts": {
            "correct": 382,
            "false_negative": 26,
            "false_positive": 64,
            "hedge": 1,
            "unparseable": 27,
            "hallucinated": 361,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.1906779661016949,
            "domain": 0.2854122621564482,
            "handle": 0.10782241014799154,
            "cultural": 0.13107822410147993,
            "sound": 0.321353065539112
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 472,
              "n_hedged": 1,
              "n_unparseable": 27,
              "hallucination_rate": 0.1906779661016949,
              "accuracy_among_assertions": 0.809322033898305,
              "hedge_rate": 0.002,
              "unparseable_rate": 0.054
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 473,
              "n_hedged": 0,
              "n_unparseable": 27,
              "hallucination_rate": 0.2854122621564482,
              "accuracy_among_assertions": 0.7145877378435518,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.054
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 473,
              "n_hedged": 0,
              "n_unparseable": 27,
              "hallucination_rate": 0.10782241014799154,
              "accuracy_among_assertions": 0.8625792811839323,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.054
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 473,
              "n_hedged": 0,
              "n_unparseable": 27,
              "hallucination_rate": 0.13107822410147993,
              "accuracy_among_assertions": 0.8393234672304439,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.054
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 473,
              "n_hedged": 0,
              "n_unparseable": 27,
              "hallucination_rate": 0.321353065539112,
              "accuracy_among_assertions": 0.6490486257928119,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.054
            }
          }
        },
        "v3_grounded": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.768,
          "accuracy_among_scored": 0.768,
          "false_negative_rate": 0.12320916905444126,
          "false_positive_rate": 0.07947019867549669,
          "hedge_rate": 0.05,
          "unparseable_rate": 0.072,
          "hallucination_rate": 0.352,
          "confidence_brier_score": 0.13439726651480638,
          "counts": {
            "correct": 384,
            "false_negative": 43,
            "false_positive": 12,
            "hedge": 25,
            "unparseable": 36,
            "hallucinated": 176,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.1252847380410023,
            "domain": 0.14157303370786517,
            "handle": 0.0,
            "cultural": 0.0,
            "sound": 0.23296703296703297
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 439,
              "n_hedged": 25,
              "n_unparseable": 36,
              "hallucination_rate": 0.1252847380410023,
              "accuracy_among_assertions": 0.8747152619589977,
              "hedge_rate": 0.05,
              "unparseable_rate": 0.072
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 445,
              "n_hedged": 19,
              "n_unparseable": 36,
              "hallucination_rate": 0.14157303370786517,
              "accuracy_among_assertions": 0.8584269662921349,
              "hedge_rate": 0.038,
              "unparseable_rate": 0.072
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 57,
              "n_hedged": 407,
              "n_unparseable": 36,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 0.7719298245614035,
              "hedge_rate": 0.814,
              "unparseable_rate": 0.072
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 464,
              "n_hedged": 0,
              "n_unparseable": 36,
              "hallucination_rate": 0.0,
              "accuracy_among_assertions": 0.9719827586206896,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.072
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 455,
              "n_hedged": 9,
              "n_unparseable": 36,
              "hallucination_rate": 0.23296703296703297,
              "accuracy_among_assertions": 0.7384615384615385,
              "hedge_rate": 0.018,
              "unparseable_rate": 0.072
            }
          }
        }
      }
    },
    "gpt-4o-mini": {
      "n": 1500,
      "n_excluded_max_tokens_truncated": 0,
      "n_scored": 1500,
      "accuracy": 0.634,
      "accuracy_among_scored": 0.634,
      "false_negative_rate": 0.19484240687679083,
      "false_positive_rate": 0.4503311258278146,
      "hedge_rate": 0.09266666666666666,
      "unparseable_rate": 0.0013333333333333333,
      "hallucination_rate": 0.176,
      "confidence_brier_score": 0.2884150047483381,
      "counts": {
        "correct": 951,
        "false_negative": 204,
        "false_positive": 204,
        "hedge": 139,
        "unparseable": 2,
        "hallucinated": 264,
        "excluded": 0,
        "n_risky_truth": 1047,
        "n_safe_truth": 453
      },
      "surface_hallucination_rate": {
        "trademark": 0.30022075055187636,
        "domain": 0.3479591836734694,
        "handle": 0.1568627450980392,
        "cultural": 0.063,
        "sound": 0.266
      },
      "surface_detail": {
        "trademark": {
          "n": 1500,
          "n_confident_assertions": 1359,
          "n_hedged": 139,
          "n_unparseable": 2,
          "hallucination_rate": 0.30022075055187636,
          "accuracy_among_assertions": 0.6997792494481236,
          "hedge_rate": 0.09266666666666666,
          "unparseable_rate": 0.0013333333333333333
        },
        "domain": {
          "n": 1500,
          "n_confident_assertions": 980,
          "n_hedged": 20,
          "n_unparseable": 500,
          "hallucination_rate": 0.3479591836734694,
          "accuracy_among_assertions": 0.6520408163265307,
          "hedge_rate": 0.013333333333333334,
          "unparseable_rate": 0.3333333333333333
        },
        "handle": {
          "n": 1500,
          "n_confident_assertions": 510,
          "n_hedged": 490,
          "n_unparseable": 500,
          "hallucination_rate": 0.1568627450980392,
          "accuracy_among_assertions": 0.796078431372549,
          "hedge_rate": 0.32666666666666666,
          "unparseable_rate": 0.3333333333333333
        },
        "cultural": {
          "n": 1500,
          "n_confident_assertions": 1000,
          "n_hedged": 0,
          "n_unparseable": 500,
          "hallucination_rate": 0.063,
          "accuracy_among_assertions": 0.909,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.3333333333333333
        },
        "sound": {
          "n": 1500,
          "n_confident_assertions": 1000,
          "n_hedged": 0,
          "n_unparseable": 500,
          "hallucination_rate": 0.266,
          "accuracy_among_assertions": 0.706,
          "hedge_rate": 0.0,
          "unparseable_rate": 0.3333333333333333
        }
      },
      "by_category": {
        "agency-firm": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.64,
          "accuracy_among_scored": 0.64,
          "false_negative_rate": 0.22857142857142856,
          "false_positive_rate": 0.35555555555555557,
          "hedge_rate": 0.09333333333333334,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.16666666666666666,
          "confidence_brier_score": 0.29070961538461537,
          "counts": {
            "correct": 96,
            "false_negative": 24,
            "false_positive": 16,
            "hedge": 14,
            "unparseable": 0,
            "hallucinated": 25,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.29411764705882354,
            "domain": 0.3125,
            "handle": 0.11290322580645161,
            "cultural": 0.06,
            "sound": 0.31
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 136,
              "n_hedged": 14,
              "n_unparseable": 0,
              "hallucination_rate": 0.29411764705882354,
              "accuracy_among_assertions": 0.7058823529411765,
              "hedge_rate": 0.09333333333333334,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 96,
              "n_hedged": 4,
              "n_unparseable": 50,
              "hallucination_rate": 0.3125,
              "accuracy_among_assertions": 0.6875,
              "hedge_rate": 0.02666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 62,
              "n_hedged": 38,
              "n_unparseable": 50,
              "hallucination_rate": 0.11290322580645161,
              "accuracy_among_assertions": 0.6774193548387096,
              "hedge_rate": 0.25333333333333335,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.06,
              "accuracy_among_assertions": 0.8,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.31,
              "accuracy_among_assertions": 0.55,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "ai-agent": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.5666666666666667,
          "accuracy_among_scored": 0.5666666666666667,
          "false_negative_rate": 0.22549019607843138,
          "false_positive_rate": 0.5,
          "hedge_rate": 0.12,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.19333333333333333,
          "confidence_brier_score": 0.3026425742574258,
          "counts": {
            "correct": 85,
            "false_negative": 23,
            "false_positive": 24,
            "hedge": 18,
            "unparseable": 0,
            "hallucinated": 29,
            "excluded": 0,
            "n_risky_truth": 102,
            "n_safe_truth": 48
          },
          "surface_hallucination_rate": {
            "trademark": 0.3560606060606061,
            "domain": 0.3673469387755102,
            "handle": 0.25,
            "cultural": 0.15,
            "sound": 0.28
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 132,
              "n_hedged": 18,
              "n_unparseable": 0,
              "hallucination_rate": 0.3560606060606061,
              "accuracy_among_assertions": 0.6439393939393939,
              "hedge_rate": 0.12,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 98,
              "n_hedged": 2,
              "n_unparseable": 50,
              "hallucination_rate": 0.3673469387755102,
              "accuracy_among_assertions": 0.6326530612244898,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 40,
              "n_hedged": 60,
              "n_unparseable": 50,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.4,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.15,
              "accuracy_among_assertions": 0.85,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.28,
              "accuracy_among_assertions": 0.72,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "biotech-pharma": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6,
          "accuracy_among_scored": 0.6,
          "false_negative_rate": 0.3047619047619048,
          "false_positive_rate": 0.35555555555555557,
          "hedge_rate": 0.08,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.19333333333333333,
          "confidence_brier_score": 0.31112636363636365,
          "counts": {
            "correct": 90,
            "false_negative": 32,
            "false_positive": 16,
            "hedge": 12,
            "unparseable": 0,
            "hallucinated": 29,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.34782608695652173,
            "domain": 0.3645833333333333,
            "handle": 0.11764705882352941,
            "cultural": 0.02,
            "sound": 0.25
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 138,
              "n_hedged": 12,
              "n_unparseable": 0,
              "hallucination_rate": 0.34782608695652173,
              "accuracy_among_assertions": 0.6521739130434783,
              "hedge_rate": 0.08,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 96,
              "n_hedged": 4,
              "n_unparseable": 50,
              "hallucination_rate": 0.3645833333333333,
              "accuracy_among_assertions": 0.6354166666666666,
              "hedge_rate": 0.02666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 51,
              "n_hedged": 49,
              "n_unparseable": 50,
              "hallucination_rate": 0.11764705882352941,
              "accuracy_among_assertions": 0.8823529411764706,
              "hedge_rate": 0.32666666666666666,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.02,
              "accuracy_among_assertions": 0.98,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.25,
              "accuracy_among_assertions": 0.75,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "dev-tools": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.58,
          "accuracy_among_scored": 0.58,
          "false_negative_rate": 0.1619047619047619,
          "false_positive_rate": 0.5555555555555556,
          "hedge_rate": 0.13333333333333333,
          "unparseable_rate": 0.006666666666666667,
          "hallucination_rate": 0.14666666666666667,
          "confidence_brier_score": 0.2765989795918367,
          "counts": {
            "correct": 87,
            "false_negative": 17,
            "false_positive": 25,
            "hedge": 20,
            "unparseable": 1,
            "hallucinated": 22,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.32558139534883723,
            "domain": 0.29896907216494845,
            "handle": 0.16216216216216217,
            "cultural": 0.01,
            "sound": 0.27
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 129,
              "n_hedged": 20,
              "n_unparseable": 1,
              "hallucination_rate": 0.32558139534883723,
              "accuracy_among_assertions": 0.6744186046511628,
              "hedge_rate": 0.13333333333333333,
              "unparseable_rate": 0.006666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 97,
              "n_hedged": 3,
              "n_unparseable": 50,
              "hallucination_rate": 0.29896907216494845,
              "accuracy_among_assertions": 0.7010309278350515,
              "hedge_rate": 0.02,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 37,
              "n_hedged": 63,
              "n_unparseable": 50,
              "hallucination_rate": 0.16216216216216217,
              "accuracy_among_assertions": 0.8378378378378378,
              "hedge_rate": 0.42,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.01,
              "accuracy_among_assertions": 0.99,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.27,
              "accuracy_among_assertions": 0.73,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "dtc-consumer": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6333333333333333,
          "accuracy_among_scored": 0.6333333333333333,
          "false_negative_rate": 0.21904761904761905,
          "false_positive_rate": 0.5111111111111111,
          "hedge_rate": 0.06,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.24,
          "confidence_brier_score": 0.31254245283018867,
          "counts": {
            "correct": 95,
            "false_negative": 23,
            "false_positive": 23,
            "hedge": 9,
            "unparseable": 0,
            "hallucinated": 36,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.3262411347517731,
            "domain": 0.37373737373737376,
            "handle": 0.16071428571428573,
            "cultural": 0.12,
            "sound": 0.18
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 141,
              "n_hedged": 9,
              "n_unparseable": 0,
              "hallucination_rate": 0.3262411347517731,
              "accuracy_among_assertions": 0.6737588652482269,
              "hedge_rate": 0.06,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 1,
              "n_unparseable": 50,
              "hallucination_rate": 0.37373737373737376,
              "accuracy_among_assertions": 0.6262626262626263,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 56,
              "n_hedged": 44,
              "n_unparseable": 50,
              "hallucination_rate": 0.16071428571428573,
              "accuracy_among_assertions": 0.7678571428571429,
              "hedge_rate": 0.29333333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.12,
              "accuracy_among_assertions": 0.84,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.18,
              "accuracy_among_assertions": 0.78,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "fintech": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6866666666666666,
          "accuracy_among_scored": 0.6866666666666666,
          "false_negative_rate": 0.11428571428571428,
          "false_positive_rate": 0.4888888888888889,
          "hedge_rate": 0.08666666666666667,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.10666666666666667,
          "confidence_brier_score": 0.2421514285714286,
          "counts": {
            "correct": 103,
            "false_negative": 12,
            "false_positive": 22,
            "hedge": 13,
            "unparseable": 0,
            "hallucinated": 16,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.24817518248175183,
            "domain": 0.3333333333333333,
            "handle": 0.08,
            "cultural": 0.05,
            "sound": 0.18
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 137,
              "n_hedged": 13,
              "n_unparseable": 0,
              "hallucination_rate": 0.24817518248175183,
              "accuracy_among_assertions": 0.7518248175182481,
              "hedge_rate": 0.08666666666666667,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 1,
              "n_unparseable": 50,
              "hallucination_rate": 0.3333333333333333,
              "accuracy_among_assertions": 0.6666666666666666,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 50,
              "n_hedged": 50,
              "n_unparseable": 50,
              "hallucination_rate": 0.08,
              "accuracy_among_assertions": 0.92,
              "hedge_rate": 0.3333333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.05,
              "accuracy_among_assertions": 0.95,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.18,
              "accuracy_among_assertions": 0.82,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "gaming": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.66,
          "accuracy_among_scored": 0.66,
          "false_negative_rate": 0.17142857142857143,
          "false_positive_rate": 0.4222222222222222,
          "hedge_rate": 0.09333333333333334,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.19333333333333333,
          "confidence_brier_score": 0.33305412844036697,
          "counts": {
            "correct": 99,
            "false_negative": 18,
            "false_positive": 19,
            "hedge": 14,
            "unparseable": 0,
            "hallucinated": 29,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.27205882352941174,
            "domain": 0.37,
            "handle": 0.1694915254237288,
            "cultural": 0.13,
            "sound": 0.27
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 136,
              "n_hedged": 14,
              "n_unparseable": 0,
              "hallucination_rate": 0.27205882352941174,
              "accuracy_among_assertions": 0.7279411764705882,
              "hedge_rate": 0.09333333333333334,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.37,
              "accuracy_among_assertions": 0.63,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 59,
              "n_hedged": 41,
              "n_unparseable": 50,
              "hallucination_rate": 0.1694915254237288,
              "accuracy_among_assertions": 0.8305084745762712,
              "hedge_rate": 0.2733333333333333,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.13,
              "accuracy_among_assertions": 0.87,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.27,
              "accuracy_among_assertions": 0.73,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "indie-maker": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.6733333333333333,
          "accuracy_among_scored": 0.6733333333333333,
          "false_negative_rate": 0.18095238095238095,
          "false_positive_rate": 0.5333333333333333,
          "hedge_rate": 0.04,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.14,
          "confidence_brier_score": 0.2663254716981132,
          "counts": {
            "correct": 101,
            "false_negative": 19,
            "false_positive": 24,
            "hedge": 6,
            "unparseable": 0,
            "hallucinated": 21,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2986111111111111,
            "domain": 0.3469387755102041,
            "handle": 0.2641509433962264,
            "cultural": 0.02,
            "sound": 0.38
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 144,
              "n_hedged": 6,
              "n_unparseable": 0,
              "hallucination_rate": 0.2986111111111111,
              "accuracy_among_assertions": 0.7013888888888888,
              "hedge_rate": 0.04,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 98,
              "n_hedged": 2,
              "n_unparseable": 50,
              "hallucination_rate": 0.3469387755102041,
              "accuracy_among_assertions": 0.6530612244897959,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 53,
              "n_hedged": 47,
              "n_unparseable": 50,
              "hallucination_rate": 0.2641509433962264,
              "accuracy_among_assertions": 0.7358490566037735,
              "hedge_rate": 0.31333333333333335,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.02,
              "accuracy_among_assertions": 0.98,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.38,
              "accuracy_among_assertions": 0.62,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "restaurant-food": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.66,
          "accuracy_among_scored": 0.66,
          "false_negative_rate": 0.21904761904761905,
          "false_positive_rate": 0.3333333333333333,
          "hedge_rate": 0.08,
          "unparseable_rate": 0.006666666666666667,
          "hallucination_rate": 0.18,
          "confidence_brier_score": 0.30858053097345134,
          "counts": {
            "correct": 99,
            "false_negative": 23,
            "false_positive": 15,
            "hedge": 12,
            "unparseable": 1,
            "hallucinated": 27,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2773722627737226,
            "domain": 0.30612244897959184,
            "handle": 0.1724137931034483,
            "cultural": 0.04,
            "sound": 0.34
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 137,
              "n_hedged": 12,
              "n_unparseable": 1,
              "hallucination_rate": 0.2773722627737226,
              "accuracy_among_assertions": 0.7226277372262774,
              "hedge_rate": 0.08,
              "unparseable_rate": 0.006666666666666667
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 98,
              "n_hedged": 2,
              "n_unparseable": 50,
              "hallucination_rate": 0.30612244897959184,
              "accuracy_among_assertions": 0.6938775510204082,
              "hedge_rate": 0.013333333333333334,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 58,
              "n_hedged": 42,
              "n_unparseable": 50,
              "hallucination_rate": 0.1724137931034483,
              "accuracy_among_assertions": 0.7068965517241379,
              "hedge_rate": 0.28,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.04,
              "accuracy_among_assertions": 0.86,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.34,
              "accuracy_among_assertions": 0.56,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "saas-b2b": {
          "n": 150,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 150,
          "accuracy": 0.64,
          "accuracy_among_scored": 0.64,
          "false_negative_rate": 0.12380952380952381,
          "false_positive_rate": 0.4444444444444444,
          "hedge_rate": 0.14,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.2,
          "confidence_brier_score": 0.23377524752475248,
          "counts": {
            "correct": 96,
            "false_negative": 13,
            "false_positive": 20,
            "hedge": 21,
            "unparseable": 0,
            "hallucinated": 30,
            "excluded": 0,
            "n_risky_truth": 105,
            "n_safe_truth": 45
          },
          "surface_hallucination_rate": {
            "trademark": 0.2558139534883721,
            "domain": 0.40404040404040403,
            "handle": 0.09090909090909091,
            "cultural": 0.03,
            "sound": 0.2
          },
          "surface_detail": {
            "trademark": {
              "n": 150,
              "n_confident_assertions": 129,
              "n_hedged": 21,
              "n_unparseable": 0,
              "hallucination_rate": 0.2558139534883721,
              "accuracy_among_assertions": 0.7441860465116279,
              "hedge_rate": 0.14,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 150,
              "n_confident_assertions": 99,
              "n_hedged": 1,
              "n_unparseable": 50,
              "hallucination_rate": 0.40404040404040403,
              "accuracy_among_assertions": 0.5959595959595959,
              "hedge_rate": 0.006666666666666667,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 150,
              "n_confident_assertions": 44,
              "n_hedged": 56,
              "n_unparseable": 50,
              "hallucination_rate": 0.09090909090909091,
              "accuracy_among_assertions": 0.9090909090909091,
              "hedge_rate": 0.37333333333333335,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.03,
              "accuracy_among_assertions": 0.97,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 150,
              "n_confident_assertions": 100,
              "n_hedged": 0,
              "n_unparseable": 50,
              "hallucination_rate": 0.2,
              "accuracy_among_assertions": 0.8,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        }
      },
      "by_difficulty": {
        "easy": {
          "n": 900,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 900,
          "accuracy": 0.5522222222222222,
          "accuracy_among_scored": 0.5522222222222222,
          "false_negative_rate": 0.19111111111111112,
          "false_positive_rate": 0.4488888888888889,
          "hedge_rate": 0.12555555555555556,
          "unparseable_rate": 0.0022222222222222222,
          "hallucination_rate": 0.14777777777777779,
          "confidence_brier_score": 0.28309036544850497,
          "counts": {
            "correct": 497,
            "false_negative": 86,
            "false_positive": 202,
            "hedge": 113,
            "unparseable": 2,
            "hallucinated": 133,
            "excluded": 0,
            "n_risky_truth": 450,
            "n_safe_truth": 450
          },
          "surface_hallucination_rate": {
            "trademark": 0.3668789808917197,
            "domain": 0.39152542372881355,
            "handle": 0.16932907348242812,
            "cultural": 0.07,
            "sound": 0.3233333333333333
          },
          "surface_detail": {
            "trademark": {
              "n": 900,
              "n_confident_assertions": 785,
              "n_hedged": 113,
              "n_unparseable": 2,
              "hallucination_rate": 0.3668789808917197,
              "accuracy_among_assertions": 0.6331210191082802,
              "hedge_rate": 0.12555555555555556,
              "unparseable_rate": 0.0022222222222222222
            },
            "domain": {
              "n": 900,
              "n_confident_assertions": 590,
              "n_hedged": 10,
              "n_unparseable": 300,
              "hallucination_rate": 0.39152542372881355,
              "accuracy_among_assertions": 0.6084745762711864,
              "hedge_rate": 0.011111111111111112,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 900,
              "n_confident_assertions": 313,
              "n_hedged": 287,
              "n_unparseable": 300,
              "hallucination_rate": 0.16932907348242812,
              "accuracy_among_assertions": 0.7603833865814696,
              "hedge_rate": 0.3188888888888889,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 900,
              "n_confident_assertions": 600,
              "n_hedged": 0,
              "n_unparseable": 300,
              "hallucination_rate": 0.07,
              "accuracy_among_assertions": 0.8866666666666667,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 900,
              "n_confident_assertions": 600,
              "n_hedged": 0,
              "n_unparseable": 300,
              "hallucination_rate": 0.3233333333333333,
              "accuracy_among_assertions": 0.6333333333333333,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "hard": {
          "n": 510,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 510,
          "accuracy": 0.7588235294117647,
          "accuracy_among_scored": 0.7588235294117647,
          "false_negative_rate": 0.19215686274509805,
          "false_positive_rate": null,
          "hedge_rate": 0.049019607843137254,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.20784313725490197,
          "confidence_brier_score": 0.2965728947368421,
          "counts": {
            "correct": 387,
            "false_negative": 98,
            "false_positive": 0,
            "hedge": 25,
            "unparseable": 0,
            "hallucinated": 106,
            "excluded": 0,
            "n_risky_truth": 510,
            "n_safe_truth": 0
          },
          "surface_hallucination_rate": {
            "trademark": 0.2020618556701031,
            "domain": 0.25075528700906347,
            "handle": 0.13690476190476192,
            "cultural": 0.052941176470588235,
            "sound": 0.17058823529411765
          },
          "surface_detail": {
            "trademark": {
              "n": 510,
              "n_confident_assertions": 485,
              "n_hedged": 25,
              "n_unparseable": 0,
              "hallucination_rate": 0.2020618556701031,
              "accuracy_among_assertions": 0.797938144329897,
              "hedge_rate": 0.049019607843137254,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 510,
              "n_confident_assertions": 331,
              "n_hedged": 9,
              "n_unparseable": 170,
              "hallucination_rate": 0.25075528700906347,
              "accuracy_among_assertions": 0.7492447129909365,
              "hedge_rate": 0.01764705882352941,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 510,
              "n_confident_assertions": 168,
              "n_hedged": 172,
              "n_unparseable": 170,
              "hallucination_rate": 0.13690476190476192,
              "accuracy_among_assertions": 0.8511904761904762,
              "hedge_rate": 0.33725490196078434,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 510,
              "n_confident_assertions": 340,
              "n_hedged": 0,
              "n_unparseable": 170,
              "hallucination_rate": 0.052941176470588235,
              "accuracy_among_assertions": 0.9411764705882353,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 510,
              "n_confident_assertions": 340,
              "n_hedged": 0,
              "n_unparseable": 170,
              "hallucination_rate": 0.17058823529411765,
              "accuracy_among_assertions": 0.8235294117647058,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        },
        "medium": {
          "n": 90,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 90,
          "accuracy": 0.7444444444444445,
          "accuracy_among_scored": 0.7444444444444445,
          "false_negative_rate": 0.22988505747126436,
          "false_positive_rate": 0.6666666666666666,
          "hedge_rate": 0.011111111111111112,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.2777777777777778,
          "confidence_brier_score": 0.28990000000000005,
          "counts": {
            "correct": 67,
            "false_negative": 20,
            "false_positive": 2,
            "hedge": 1,
            "unparseable": 0,
            "hallucinated": 25,
            "excluded": 0,
            "n_risky_truth": 87,
            "n_safe_truth": 3
          },
          "surface_hallucination_rate": {
            "trademark": 0.24719101123595505,
            "domain": 0.4576271186440678,
            "handle": 0.13793103448275862,
            "cultural": 0.05,
            "sound": 0.23333333333333334
          },
          "surface_detail": {
            "trademark": {
              "n": 90,
              "n_confident_assertions": 89,
              "n_hedged": 1,
              "n_unparseable": 0,
              "hallucination_rate": 0.24719101123595505,
              "accuracy_among_assertions": 0.7528089887640449,
              "hedge_rate": 0.011111111111111112,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 90,
              "n_confident_assertions": 59,
              "n_hedged": 1,
              "n_unparseable": 30,
              "hallucination_rate": 0.4576271186440678,
              "accuracy_among_assertions": 0.5423728813559322,
              "hedge_rate": 0.011111111111111112,
              "unparseable_rate": 0.3333333333333333
            },
            "handle": {
              "n": 90,
              "n_confident_assertions": 29,
              "n_hedged": 31,
              "n_unparseable": 30,
              "hallucination_rate": 0.13793103448275862,
              "accuracy_among_assertions": 0.8620689655172413,
              "hedge_rate": 0.34444444444444444,
              "unparseable_rate": 0.3333333333333333
            },
            "cultural": {
              "n": 90,
              "n_confident_assertions": 60,
              "n_hedged": 0,
              "n_unparseable": 30,
              "hallucination_rate": 0.05,
              "accuracy_among_assertions": 0.95,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            },
            "sound": {
              "n": 90,
              "n_confident_assertions": 60,
              "n_hedged": 0,
              "n_unparseable": 30,
              "hallucination_rate": 0.23333333333333334,
              "accuracy_among_assertions": 0.7666666666666667,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.3333333333333333
            }
          }
        }
      },
      "by_prompt_version": {
        "v1_naive": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.692,
          "accuracy_among_scored": 0.692,
          "false_negative_rate": 0.0,
          "false_positive_rate": 0.9139072847682119,
          "hedge_rate": 0.028,
          "unparseable_rate": 0.004,
          "hallucination_rate": 0.014,
          "confidence_brier_score": 0.46101685393258424,
          "counts": {
            "correct": 346,
            "false_negative": 0,
            "false_positive": 138,
            "hedge": 14,
            "unparseable": 2,
            "hallucinated": 7,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.28512396694214875,
            "domain": null,
            "handle": null,
            "cultural": null,
            "sound": null
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 484,
              "n_hedged": 14,
              "n_unparseable": 2,
              "hallucination_rate": 0.28512396694214875,
              "accuracy_among_assertions": 0.7148760330578512,
              "hedge_rate": 0.028,
              "unparseable_rate": 0.004
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 0,
              "n_hedged": 0,
              "n_unparseable": 500,
              "hallucination_rate": null,
              "accuracy_among_assertions": null,
              "hedge_rate": 0.0,
              "unparseable_rate": 1.0
            }
          }
        },
        "v2_constrained": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.38,
          "accuracy_among_scored": 0.38,
          "false_negative_rate": 0.5329512893982808,
          "false_positive_rate": 0.006622516556291391,
          "hedge_rate": 0.246,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.118,
          "confidence_brier_score": 0.33146551724137935,
          "counts": {
            "correct": 190,
            "false_negative": 186,
            "false_positive": 1,
            "hedge": 123,
            "unparseable": 0,
            "hallucinated": 59,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.4960212201591512,
            "domain": 0.412,
            "handle": 0.17567567567567569,
            "cultural": 0.058,
            "sound": 0.322
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 377,
              "n_hedged": 123,
              "n_unparseable": 0,
              "hallucination_rate": 0.4960212201591512,
              "accuracy_among_assertions": 0.5039787798408488,
              "hedge_rate": 0.246,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.412,
              "accuracy_among_assertions": 0.588,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 444,
              "n_hedged": 56,
              "n_unparseable": 0,
              "hallucination_rate": 0.17567567567567569,
              "accuracy_among_assertions": 0.7995495495495496,
              "hedge_rate": 0.112,
              "unparseable_rate": 0.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.058,
              "accuracy_among_assertions": 0.914,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.322,
              "accuracy_among_assertions": 0.65,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            }
          }
        },
        "v3_grounded": {
          "n": 500,
          "n_excluded_max_tokens_truncated": 0,
          "n_scored": 500,
          "accuracy": 0.83,
          "accuracy_among_scored": 0.83,
          "false_negative_rate": 0.05157593123209169,
          "false_positive_rate": 0.4304635761589404,
          "hedge_rate": 0.004,
          "unparseable_rate": 0.0,
          "hallucination_rate": 0.396,
          "confidence_brier_score": 0.1941315261044177,
          "counts": {
            "correct": 415,
            "false_negative": 18,
            "false_positive": 65,
            "hedge": 2,
            "unparseable": 0,
            "hallucinated": 198,
            "excluded": 0,
            "n_risky_truth": 349,
            "n_safe_truth": 151
          },
          "surface_hallucination_rate": {
            "trademark": 0.16666666666666666,
            "domain": 0.28125,
            "handle": 0.030303030303030304,
            "cultural": 0.068,
            "sound": 0.21
          },
          "surface_detail": {
            "trademark": {
              "n": 500,
              "n_confident_assertions": 498,
              "n_hedged": 2,
              "n_unparseable": 0,
              "hallucination_rate": 0.16666666666666666,
              "accuracy_among_assertions": 0.8333333333333334,
              "hedge_rate": 0.004,
              "unparseable_rate": 0.0
            },
            "domain": {
              "n": 500,
              "n_confident_assertions": 480,
              "n_hedged": 20,
              "n_unparseable": 0,
              "hallucination_rate": 0.28125,
              "accuracy_among_assertions": 0.71875,
              "hedge_rate": 0.04,
              "unparseable_rate": 0.0
            },
            "handle": {
              "n": 500,
              "n_confident_assertions": 66,
              "n_hedged": 434,
              "n_unparseable": 0,
              "hallucination_rate": 0.030303030303030304,
              "accuracy_among_assertions": 0.7727272727272727,
              "hedge_rate": 0.868,
              "unparseable_rate": 0.0
            },
            "cultural": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.068,
              "accuracy_among_assertions": 0.904,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            },
            "sound": {
              "n": 500,
              "n_confident_assertions": 500,
              "n_hedged": 0,
              "n_unparseable": 0,
              "hallucination_rate": 0.21,
              "accuracy_among_assertions": 0.762,
              "hedge_rate": 0.0,
              "unparseable_rate": 0.0
            }
          }
        }
      }
    }
  },
  "by_trap_type": {
    "dead_mark_lookalike": {
      "claude-opus-4-7": {
        "n": 3,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 3,
        "accuracy": 1.0,
        "accuracy_among_scored": 1.0,
        "false_negative_rate": 0.0,
        "false_positive_rate": null,
        "hedge_rate": 0.0,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.0,
        "confidence_brier_score": 0.022500000000000006,
        "counts": {
          "correct": 3,
          "false_negative": 0,
          "false_positive": 0,
          "hedge": 0,
          "unparseable": 0,
          "hallucinated": 0,
          "excluded": 0,
          "n_risky_truth": 3,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.0,
          "domain": 1.0,
          "handle": null,
          "cultural": 1.0,
          "sound": 0.0
        },
        "surface_detail": {
          "trademark": {
            "n": 3,
            "n_confident_assertions": 3,
            "n_hedged": 0,
            "n_unparseable": 0,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 1.0,
            "accuracy_among_assertions": 0.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 3,
            "n_confident_assertions": 0,
            "n_hedged": 2,
            "n_unparseable": 1,
            "hallucination_rate": null,
            "accuracy_among_assertions": null,
            "hedge_rate": 0.6666666666666666,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 1.0,
            "accuracy_among_assertions": 0.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gemini-2.5-pro": {
        "n": 3,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 3,
        "accuracy": 1.0,
        "accuracy_among_scored": 1.0,
        "false_negative_rate": 0.0,
        "false_positive_rate": null,
        "hedge_rate": 0.0,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.3333333333333333,
        "confidence_brier_score": 0.3243,
        "counts": {
          "correct": 3,
          "false_negative": 0,
          "false_positive": 0,
          "hedge": 0,
          "unparseable": 0,
          "hallucinated": 1,
          "excluded": 0,
          "n_risky_truth": 3,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.0,
          "domain": 1.0,
          "handle": 0.0,
          "cultural": 0.5,
          "sound": 0.0
        },
        "surface_detail": {
          "trademark": {
            "n": 3,
            "n_confident_assertions": 3,
            "n_hedged": 0,
            "n_unparseable": 0,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 1.0,
            "accuracy_among_assertions": 0.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 3,
            "n_confident_assertions": 1,
            "n_hedged": 1,
            "n_unparseable": 1,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.3333333333333333,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gpt-4o-mini": {
        "n": 3,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 3,
        "accuracy": 1.0,
        "accuracy_among_scored": 1.0,
        "false_negative_rate": 0.0,
        "false_positive_rate": null,
        "hedge_rate": 0.0,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.0,
        "confidence_brier_score": 0.3809666666666667,
        "counts": {
          "correct": 3,
          "false_negative": 0,
          "false_positive": 0,
          "hedge": 0,
          "unparseable": 0,
          "hallucinated": 0,
          "excluded": 0,
          "n_risky_truth": 3,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.0,
          "domain": 1.0,
          "handle": 0.0,
          "cultural": 0.5,
          "sound": 0.0
        },
        "surface_detail": {
          "trademark": {
            "n": 3,
            "n_confident_assertions": 3,
            "n_hedged": 0,
            "n_unparseable": 0,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 1.0,
            "accuracy_among_assertions": 0.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 3,
            "n_confident_assertions": 1,
            "n_hedged": 1,
            "n_unparseable": 1,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.3333333333333333,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 3,
            "n_confident_assertions": 2,
            "n_hedged": 0,
            "n_unparseable": 1,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      }
    },
    "famous_mark": {
      "claude-opus-4-7": {
        "n": 624,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 624,
        "accuracy": 0.9439102564102564,
        "accuracy_among_scored": 0.9439102564102564,
        "false_negative_rate": 0.016025641025641024,
        "false_positive_rate": null,
        "hedge_rate": 0.04006410256410257,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.09775641025641026,
        "confidence_brier_score": 0.17422613861386138,
        "counts": {
          "correct": 589,
          "false_negative": 10,
          "false_positive": 0,
          "hedge": 25,
          "unparseable": 0,
          "hallucinated": 61,
          "excluded": 0,
          "n_risky_truth": 624,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.01669449081803005,
          "domain": 0.021686746987951807,
          "handle": 0.0,
          "cultural": 0.06490384615384616,
          "sound": 0.24519230769230768
        },
        "surface_detail": {
          "trademark": {
            "n": 624,
            "n_confident_assertions": 599,
            "n_hedged": 25,
            "n_unparseable": 0,
            "hallucination_rate": 0.01669449081803005,
            "accuracy_among_assertions": 0.9833055091819699,
            "hedge_rate": 0.04006410256410257,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 624,
            "n_confident_assertions": 415,
            "n_hedged": 1,
            "n_unparseable": 208,
            "hallucination_rate": 0.021686746987951807,
            "accuracy_among_assertions": 0.9783132530120482,
            "hedge_rate": 0.0016025641025641025,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 624,
            "n_confident_assertions": 176,
            "n_hedged": 240,
            "n_unparseable": 208,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.38461538461538464,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 624,
            "n_confident_assertions": 416,
            "n_hedged": 0,
            "n_unparseable": 208,
            "hallucination_rate": 0.06490384615384616,
            "accuracy_among_assertions": 0.9014423076923077,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 624,
            "n_confident_assertions": 416,
            "n_hedged": 0,
            "n_unparseable": 208,
            "hallucination_rate": 0.24519230769230768,
            "accuracy_among_assertions": 0.7211538461538461,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gemini-2.5-pro": {
        "n": 624,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 624,
        "accuracy": 0.8942307692307693,
        "accuracy_among_scored": 0.8942307692307693,
        "false_negative_rate": 0.057692307692307696,
        "false_positive_rate": null,
        "hedge_rate": 0.014423076923076924,
        "unparseable_rate": 0.03365384615384615,
        "hallucination_rate": 0.5080128205128205,
        "confidence_brier_score": 0.2819300869565218,
        "counts": {
          "correct": 558,
          "false_negative": 36,
          "false_positive": 0,
          "hedge": 9,
          "unparseable": 21,
          "hallucinated": 317,
          "excluded": 0,
          "n_risky_truth": 624,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.06060606060606061,
          "domain": 0.027989821882951654,
          "handle": 0.0045662100456621,
          "cultural": 0.06582278481012659,
          "sound": 0.2230769230769231
        },
        "surface_detail": {
          "trademark": {
            "n": 624,
            "n_confident_assertions": 594,
            "n_hedged": 9,
            "n_unparseable": 21,
            "hallucination_rate": 0.06060606060606061,
            "accuracy_among_assertions": 0.9393939393939394,
            "hedge_rate": 0.014423076923076924,
            "unparseable_rate": 0.03365384615384615
          },
          "domain": {
            "n": 624,
            "n_confident_assertions": 393,
            "n_hedged": 2,
            "n_unparseable": 229,
            "hallucination_rate": 0.027989821882951654,
            "accuracy_among_assertions": 0.9720101781170484,
            "hedge_rate": 0.003205128205128205,
            "unparseable_rate": 0.36698717948717946
          },
          "handle": {
            "n": 624,
            "n_confident_assertions": 219,
            "n_hedged": 176,
            "n_unparseable": 229,
            "hallucination_rate": 0.0045662100456621,
            "accuracy_among_assertions": 0.9315068493150684,
            "hedge_rate": 0.28205128205128205,
            "unparseable_rate": 0.36698717948717946
          },
          "cultural": {
            "n": 624,
            "n_confident_assertions": 395,
            "n_hedged": 0,
            "n_unparseable": 229,
            "hallucination_rate": 0.06582278481012659,
            "accuracy_among_assertions": 0.8987341772151899,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.36698717948717946
          },
          "sound": {
            "n": 624,
            "n_confident_assertions": 390,
            "n_hedged": 5,
            "n_unparseable": 229,
            "hallucination_rate": 0.2230769230769231,
            "accuracy_among_assertions": 0.7410256410256411,
            "hedge_rate": 0.008012820512820512,
            "unparseable_rate": 0.36698717948717946
          }
        }
      },
      "gpt-4o-mini": {
        "n": 624,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 624,
        "accuracy": 0.7884615384615384,
        "accuracy_among_scored": 0.7884615384615384,
        "false_negative_rate": 0.20512820512820512,
        "false_positive_rate": null,
        "hedge_rate": 0.00641025641025641,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.28044871794871795,
        "confidence_brier_score": 0.3311266536964981,
        "counts": {
          "correct": 492,
          "false_negative": 128,
          "false_positive": 0,
          "hedge": 4,
          "unparseable": 0,
          "hallucinated": 175,
          "excluded": 0,
          "n_risky_truth": 624,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.2064516129032258,
          "domain": 0.03155339805825243,
          "handle": 0.045454545454545456,
          "cultural": 0.10817307692307693,
          "sound": 0.21153846153846154
        },
        "surface_detail": {
          "trademark": {
            "n": 624,
            "n_confident_assertions": 620,
            "n_hedged": 4,
            "n_unparseable": 0,
            "hallucination_rate": 0.2064516129032258,
            "accuracy_among_assertions": 0.7935483870967742,
            "hedge_rate": 0.00641025641025641,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 624,
            "n_confident_assertions": 412,
            "n_hedged": 4,
            "n_unparseable": 208,
            "hallucination_rate": 0.03155339805825243,
            "accuracy_among_assertions": 0.9684466019417476,
            "hedge_rate": 0.00641025641025641,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 624,
            "n_confident_assertions": 220,
            "n_hedged": 196,
            "n_unparseable": 208,
            "hallucination_rate": 0.045454545454545456,
            "accuracy_among_assertions": 0.8954545454545455,
            "hedge_rate": 0.3141025641025641,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 624,
            "n_confident_assertions": 416,
            "n_hedged": 0,
            "n_unparseable": 208,
            "hallucination_rate": 0.10817307692307693,
            "accuracy_among_assertions": 0.8581730769230769,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 624,
            "n_confident_assertions": 416,
            "n_hedged": 0,
            "n_unparseable": 208,
            "hallucination_rate": 0.21153846153846154,
            "accuracy_among_assertions": 0.7548076923076923,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      }
    },
    "foreign_brand": {
      "claude-opus-4-7": {
        "n": 12,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 12,
        "accuracy": 0.8333333333333334,
        "accuracy_among_scored": 0.8333333333333334,
        "false_negative_rate": 0.0,
        "false_positive_rate": null,
        "hedge_rate": 0.16666666666666666,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.16666666666666666,
        "confidence_brier_score": 0.2660625,
        "counts": {
          "correct": 10,
          "false_negative": 0,
          "false_positive": 0,
          "hedge": 2,
          "unparseable": 0,
          "hallucinated": 2,
          "excluded": 0,
          "n_risky_truth": 12,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.0,
          "domain": 0.2,
          "handle": null,
          "cultural": 0.125,
          "sound": 0.5
        },
        "surface_detail": {
          "trademark": {
            "n": 12,
            "n_confident_assertions": 10,
            "n_hedged": 2,
            "n_unparseable": 0,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.16666666666666666,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 12,
            "n_confident_assertions": 5,
            "n_hedged": 3,
            "n_unparseable": 4,
            "hallucination_rate": 0.2,
            "accuracy_among_assertions": 0.8,
            "hedge_rate": 0.25,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 12,
            "n_confident_assertions": 0,
            "n_hedged": 8,
            "n_unparseable": 4,
            "hallucination_rate": null,
            "accuracy_among_assertions": null,
            "hedge_rate": 0.6666666666666666,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.125,
            "accuracy_among_assertions": 0.875,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gemini-2.5-pro": {
        "n": 12,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 12,
        "accuracy": 0.8333333333333334,
        "accuracy_among_scored": 0.8333333333333334,
        "false_negative_rate": 0.16666666666666666,
        "false_positive_rate": null,
        "hedge_rate": 0.0,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.4166666666666667,
        "confidence_brier_score": 0.22982499999999997,
        "counts": {
          "correct": 10,
          "false_negative": 2,
          "false_positive": 0,
          "hedge": 0,
          "unparseable": 0,
          "hallucinated": 5,
          "excluded": 0,
          "n_risky_truth": 12,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.16666666666666666,
          "domain": 0.5,
          "handle": 0.25,
          "cultural": 0.125,
          "sound": 0.5
        },
        "surface_detail": {
          "trademark": {
            "n": 12,
            "n_confident_assertions": 12,
            "n_hedged": 0,
            "n_unparseable": 0,
            "hallucination_rate": 0.16666666666666666,
            "accuracy_among_assertions": 0.8333333333333334,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 12,
            "n_confident_assertions": 4,
            "n_hedged": 4,
            "n_unparseable": 4,
            "hallucination_rate": 0.25,
            "accuracy_among_assertions": 0.75,
            "hedge_rate": 0.3333333333333333,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.125,
            "accuracy_among_assertions": 0.875,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gpt-4o-mini": {
        "n": 12,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 12,
        "accuracy": 0.6666666666666666,
        "accuracy_among_scored": 0.6666666666666666,
        "false_negative_rate": 0.16666666666666666,
        "false_positive_rate": null,
        "hedge_rate": 0.16666666666666666,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.16666666666666666,
        "confidence_brier_score": 0.33327142857142855,
        "counts": {
          "correct": 8,
          "false_negative": 2,
          "false_positive": 0,
          "hedge": 2,
          "unparseable": 0,
          "hallucinated": 2,
          "excluded": 0,
          "n_risky_truth": 12,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.2,
          "domain": 0.375,
          "handle": 0.25,
          "cultural": 0.25,
          "sound": 0.5
        },
        "surface_detail": {
          "trademark": {
            "n": 12,
            "n_confident_assertions": 10,
            "n_hedged": 2,
            "n_unparseable": 0,
            "hallucination_rate": 0.2,
            "accuracy_among_assertions": 0.8,
            "hedge_rate": 0.16666666666666666,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.375,
            "accuracy_among_assertions": 0.625,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 12,
            "n_confident_assertions": 4,
            "n_hedged": 4,
            "n_unparseable": 4,
            "hallucination_rate": 0.25,
            "accuracy_among_assertions": 0.75,
            "hedge_rate": 0.3333333333333333,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.25,
            "accuracy_among_assertions": 0.75,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 12,
            "n_confident_assertions": 8,
            "n_hedged": 0,
            "n_unparseable": 4,
            "hallucination_rate": 0.5,
            "accuracy_among_assertions": 0.5,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      }
    },
    "phonetic_neighbor_famous": {
      "claude-opus-4-7": {
        "n": 108,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 108,
        "accuracy": 0.7222222222222222,
        "accuracy_among_scored": 0.7222222222222222,
        "false_negative_rate": 0.009259259259259259,
        "false_positive_rate": null,
        "hedge_rate": 0.26851851851851855,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.018518518518518517,
        "confidence_brier_score": 0.14255925925925925,
        "counts": {
          "correct": 78,
          "false_negative": 1,
          "false_positive": 0,
          "hedge": 29,
          "unparseable": 0,
          "hallucinated": 2,
          "excluded": 0,
          "n_risky_truth": 108,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.012658227848101266,
          "domain": 0.918918918918919,
          "handle": null,
          "cultural": 0.3888888888888889,
          "sound": 0.3472222222222222
        },
        "surface_detail": {
          "trademark": {
            "n": 108,
            "n_confident_assertions": 79,
            "n_hedged": 29,
            "n_unparseable": 0,
            "hallucination_rate": 0.012658227848101266,
            "accuracy_among_assertions": 0.9873417721518988,
            "hedge_rate": 0.26851851851851855,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 108,
            "n_confident_assertions": 37,
            "n_hedged": 35,
            "n_unparseable": 36,
            "hallucination_rate": 0.918918918918919,
            "accuracy_among_assertions": 0.08108108108108109,
            "hedge_rate": 0.32407407407407407,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 108,
            "n_confident_assertions": 0,
            "n_hedged": 72,
            "n_unparseable": 36,
            "hallucination_rate": null,
            "accuracy_among_assertions": null,
            "hedge_rate": 0.6666666666666666,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 108,
            "n_confident_assertions": 72,
            "n_hedged": 0,
            "n_unparseable": 36,
            "hallucination_rate": 0.3888888888888889,
            "accuracy_among_assertions": 0.6111111111111112,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 108,
            "n_confident_assertions": 72,
            "n_hedged": 0,
            "n_unparseable": 36,
            "hallucination_rate": 0.3472222222222222,
            "accuracy_among_assertions": 0.6527777777777778,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gemini-2.5-pro": {
        "n": 108,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 108,
        "accuracy": 0.8148148148148148,
        "accuracy_among_scored": 0.8148148148148148,
        "false_negative_rate": 0.1574074074074074,
        "false_positive_rate": null,
        "hedge_rate": 0.009259259259259259,
        "unparseable_rate": 0.018518518518518517,
        "hallucination_rate": 0.35185185185185186,
        "confidence_brier_score": 0.3309155339805825,
        "counts": {
          "correct": 88,
          "false_negative": 17,
          "false_positive": 0,
          "hedge": 1,
          "unparseable": 2,
          "hallucinated": 38,
          "excluded": 0,
          "n_risky_truth": 108,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.1619047619047619,
          "domain": 0.7878787878787878,
          "handle": 0.14285714285714285,
          "cultural": 0.1,
          "sound": 0.3142857142857143
        },
        "surface_detail": {
          "trademark": {
            "n": 108,
            "n_confident_assertions": 105,
            "n_hedged": 1,
            "n_unparseable": 2,
            "hallucination_rate": 0.1619047619047619,
            "accuracy_among_assertions": 0.8380952380952381,
            "hedge_rate": 0.009259259259259259,
            "unparseable_rate": 0.018518518518518517
          },
          "domain": {
            "n": 108,
            "n_confident_assertions": 66,
            "n_hedged": 4,
            "n_unparseable": 38,
            "hallucination_rate": 0.7878787878787878,
            "accuracy_among_assertions": 0.21212121212121213,
            "hedge_rate": 0.037037037037037035,
            "unparseable_rate": 0.35185185185185186
          },
          "handle": {
            "n": 108,
            "n_confident_assertions": 35,
            "n_hedged": 35,
            "n_unparseable": 38,
            "hallucination_rate": 0.14285714285714285,
            "accuracy_among_assertions": 0.8571428571428571,
            "hedge_rate": 0.32407407407407407,
            "unparseable_rate": 0.35185185185185186
          },
          "cultural": {
            "n": 108,
            "n_confident_assertions": 70,
            "n_hedged": 0,
            "n_unparseable": 38,
            "hallucination_rate": 0.1,
            "accuracy_among_assertions": 0.9,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.35185185185185186
          },
          "sound": {
            "n": 108,
            "n_confident_assertions": 70,
            "n_hedged": 0,
            "n_unparseable": 38,
            "hallucination_rate": 0.3142857142857143,
            "accuracy_among_assertions": 0.6857142857142857,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.35185185185185186
          }
        }
      },
      "gpt-4o-mini": {
        "n": 108,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 108,
        "accuracy": 0.6481481481481481,
        "accuracy_among_scored": 0.6481481481481481,
        "false_negative_rate": 0.21296296296296297,
        "false_positive_rate": null,
        "hedge_rate": 0.1388888888888889,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.018518518518518517,
        "confidence_brier_score": 0.3955071428571429,
        "counts": {
          "correct": 70,
          "false_negative": 23,
          "false_positive": 0,
          "hedge": 15,
          "unparseable": 0,
          "hallucinated": 2,
          "excluded": 0,
          "n_risky_truth": 108,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.24731182795698925,
          "domain": 0.9090909090909091,
          "handle": 0.2,
          "cultural": 0.1111111111111111,
          "sound": 0.2777777777777778
        },
        "surface_detail": {
          "trademark": {
            "n": 108,
            "n_confident_assertions": 93,
            "n_hedged": 15,
            "n_unparseable": 0,
            "hallucination_rate": 0.24731182795698925,
            "accuracy_among_assertions": 0.7526881720430108,
            "hedge_rate": 0.1388888888888889,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 108,
            "n_confident_assertions": 66,
            "n_hedged": 6,
            "n_unparseable": 36,
            "hallucination_rate": 0.9090909090909091,
            "accuracy_among_assertions": 0.09090909090909091,
            "hedge_rate": 0.05555555555555555,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 108,
            "n_confident_assertions": 30,
            "n_hedged": 42,
            "n_unparseable": 36,
            "hallucination_rate": 0.2,
            "accuracy_among_assertions": 0.8,
            "hedge_rate": 0.3888888888888889,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 108,
            "n_confident_assertions": 72,
            "n_hedged": 0,
            "n_unparseable": 36,
            "hallucination_rate": 0.1111111111111111,
            "accuracy_among_assertions": 0.8888888888888888,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 108,
            "n_confident_assertions": 72,
            "n_hedged": 0,
            "n_unparseable": 36,
            "hallucination_rate": 0.2777777777777778,
            "accuracy_among_assertions": 0.7222222222222222,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      }
    },
    "recent_micro_startup": {
      "claude-opus-4-7": {
        "n": 30,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 30,
        "accuracy": 0.7666666666666667,
        "accuracy_among_scored": 0.7666666666666667,
        "false_negative_rate": 0.06666666666666667,
        "false_positive_rate": null,
        "hedge_rate": 0.16666666666666666,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.13333333333333333,
        "confidence_brier_score": 0.2579947368421053,
        "counts": {
          "correct": 23,
          "false_negative": 2,
          "false_positive": 0,
          "hedge": 5,
          "unparseable": 0,
          "hallucinated": 4,
          "excluded": 0,
          "n_risky_truth": 30,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.08,
          "domain": 0.0,
          "handle": 0.0,
          "cultural": 0.15,
          "sound": 0.25
        },
        "surface_detail": {
          "trademark": {
            "n": 30,
            "n_confident_assertions": 25,
            "n_hedged": 5,
            "n_unparseable": 0,
            "hallucination_rate": 0.08,
            "accuracy_among_assertions": 0.92,
            "hedge_rate": 0.16666666666666666,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 30,
            "n_confident_assertions": 18,
            "n_hedged": 2,
            "n_unparseable": 10,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.06666666666666667,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 30,
            "n_confident_assertions": 4,
            "n_hedged": 16,
            "n_unparseable": 10,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.5333333333333333,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 30,
            "n_confident_assertions": 20,
            "n_hedged": 0,
            "n_unparseable": 10,
            "hallucination_rate": 0.15,
            "accuracy_among_assertions": 0.85,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 30,
            "n_confident_assertions": 20,
            "n_hedged": 0,
            "n_unparseable": 10,
            "hallucination_rate": 0.25,
            "accuracy_among_assertions": 0.75,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      },
      "gemini-2.5-pro": {
        "n": 30,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 30,
        "accuracy": 0.6666666666666666,
        "accuracy_among_scored": 0.6666666666666666,
        "false_negative_rate": 0.16666666666666666,
        "false_positive_rate": null,
        "hedge_rate": 0.03333333333333333,
        "unparseable_rate": 0.13333333333333333,
        "hallucination_rate": 0.4666666666666667,
        "confidence_brier_score": 0.41672799999999993,
        "counts": {
          "correct": 20,
          "false_negative": 5,
          "false_positive": 0,
          "hedge": 1,
          "unparseable": 4,
          "hallucinated": 14,
          "excluded": 0,
          "n_risky_truth": 30,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.2,
          "domain": 0.11764705882352941,
          "handle": 0.2222222222222222,
          "cultural": 0.17647058823529413,
          "sound": 0.29411764705882354
        },
        "surface_detail": {
          "trademark": {
            "n": 30,
            "n_confident_assertions": 25,
            "n_hedged": 1,
            "n_unparseable": 4,
            "hallucination_rate": 0.2,
            "accuracy_among_assertions": 0.8,
            "hedge_rate": 0.03333333333333333,
            "unparseable_rate": 0.13333333333333333
          },
          "domain": {
            "n": 30,
            "n_confident_assertions": 17,
            "n_hedged": 0,
            "n_unparseable": 13,
            "hallucination_rate": 0.11764705882352941,
            "accuracy_among_assertions": 0.8823529411764706,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.43333333333333335
          },
          "handle": {
            "n": 30,
            "n_confident_assertions": 9,
            "n_hedged": 8,
            "n_unparseable": 13,
            "hallucination_rate": 0.2222222222222222,
            "accuracy_among_assertions": 0.7777777777777778,
            "hedge_rate": 0.26666666666666666,
            "unparseable_rate": 0.43333333333333335
          },
          "cultural": {
            "n": 30,
            "n_confident_assertions": 17,
            "n_hedged": 0,
            "n_unparseable": 13,
            "hallucination_rate": 0.17647058823529413,
            "accuracy_among_assertions": 0.8235294117647058,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.43333333333333335
          },
          "sound": {
            "n": 30,
            "n_confident_assertions": 17,
            "n_hedged": 0,
            "n_unparseable": 13,
            "hallucination_rate": 0.29411764705882354,
            "accuracy_among_assertions": 0.7058823529411765,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.43333333333333335
          }
        }
      },
      "gpt-4o-mini": {
        "n": 30,
        "n_excluded_max_tokens_truncated": 0,
        "n_scored": 30,
        "accuracy": 0.7,
        "accuracy_among_scored": 0.7,
        "false_negative_rate": 0.13333333333333333,
        "false_positive_rate": null,
        "hedge_rate": 0.16666666666666666,
        "unparseable_rate": 0.0,
        "hallucination_rate": 0.16666666666666666,
        "confidence_brier_score": 0.275,
        "counts": {
          "correct": 21,
          "false_negative": 4,
          "false_positive": 0,
          "hedge": 5,
          "unparseable": 0,
          "hallucinated": 5,
          "excluded": 0,
          "n_risky_truth": 30,
          "n_safe_truth": 0
        },
        "surface_hallucination_rate": {
          "trademark": 0.16,
          "domain": 0.15,
          "handle": 0.375,
          "cultural": 0.0,
          "sound": 0.3
        },
        "surface_detail": {
          "trademark": {
            "n": 30,
            "n_confident_assertions": 25,
            "n_hedged": 5,
            "n_unparseable": 0,
            "hallucination_rate": 0.16,
            "accuracy_among_assertions": 0.84,
            "hedge_rate": 0.16666666666666666,
            "unparseable_rate": 0.0
          },
          "domain": {
            "n": 30,
            "n_confident_assertions": 20,
            "n_hedged": 0,
            "n_unparseable": 10,
            "hallucination_rate": 0.15,
            "accuracy_among_assertions": 0.85,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "handle": {
            "n": 30,
            "n_confident_assertions": 8,
            "n_hedged": 12,
            "n_unparseable": 10,
            "hallucination_rate": 0.375,
            "accuracy_among_assertions": 0.625,
            "hedge_rate": 0.4,
            "unparseable_rate": 0.3333333333333333
          },
          "cultural": {
            "n": 30,
            "n_confident_assertions": 20,
            "n_hedged": 0,
            "n_unparseable": 10,
            "hallucination_rate": 0.0,
            "accuracy_among_assertions": 1.0,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          },
          "sound": {
            "n": 30,
            "n_confident_assertions": 20,
            "n_hedged": 0,
            "n_unparseable": 10,
            "hallucination_rate": 0.3,
            "accuracy_among_assertions": 0.7,
            "hedge_rate": 0.0,
            "unparseable_rate": 0.3333333333333333
          }
        }
      }
    }
  },
  "etymolt_accuracy": {
    "trademark": {
      "n": 245,
      "accuracy": 0.5510204081632653
    },
    "domain": {
      "n": 245,
      "accuracy": 0.8
    },
    "handle": {
      "n": 245,
      "accuracy": 1.0
    },
    "cultural": {
      "n": 245,
      "accuracy": 1.0
    },
    "sound": {
      "n": 245,
      "accuracy": 0.8244897959183674
    }
  },
  "results_csv_sha256": "0f9939caae77424feb8f3e0db69eab087861ae97cce26016f64d3606d5005edc"
}
