{
  "version": "1.0",
  "categories": [
    {"id":"language-communication","name":"Language & Communication","type":"capability"},
    {"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"},
    {"id":"problem-solving","name":"Problem Solving","type":"capability"},
    {"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"},
    {"id":"learning-memory","name":"Learning & Memory","type":"capability"},
    {"id":"perception-vision","name":"Perception & Vision","type":"capability"},
    {"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"},
    {"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"},
    {"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"},
    {"id":"harmful-content","name":"Harmful Content Generation","type":"risk"},
    {"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"},
    {"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"},
    {"id":"bias-fairness","name":"Bias & Fairness","type":"risk"},
    {"id":"security-robustness","name":"Security & Robustness","type":"risk"},
    {"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"},
    {"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"},
    {"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"},
    {"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"},
    {"id":"governance-accountability","name":"Governance & Accountability","type":"risk"},
    {"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"}
  ],
  "benchmarkQuestions": [
    {
      "id":"A1",
      "text":"Has the system been run on recognized, category-specific benchmarks?",
      "tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
      "hint":"List benchmarks, dataset versions and who executed them."
    },
    {
      "id":"A2",
      "text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
      "tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
      "hint":"Provide numeric scores and regulatory mappings."
    },
    {
      "id":"A3",
      "text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
      "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
      "hint":"Provide comparative scores and targets."
    },
    {
      "id":"A4",
      "text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
      "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
      "hint":"Describe stress tests and observed failure rates."
    },
    {
      "id":"A5",
      "text":"Is performance measured in the wild with automated monitors?",
      "tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
      "hint":"List live metrics and alerting rules."
    },
    {
      "id":"A6",
      "text":"Have you quantified train–test overlap or leakage risks that could inflate results?",
      "tooltip":"Expect: Procedure (e.g., n‑gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
      "hint":"Report contamination checks and mitigation steps."
    }
  ],
  "processQuestions": [
    {
      "id":"B1",
      "text":"Are the capability/risk claims and applicability for this category clearly documented?",
      "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
      "hint":"Define scope and hypotheses."
    },
    {
      "id":"B2",
      "text":"Can others reproduce the results?",
      "tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
      "hint":"Point to code, data or proxies to reproduce results."
    },
    {
      "id":"B3",
      "text":"Have domain experts/affected users reviewed interpretations of results?",
      "tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
      "hint":"List reviewers and key feedback."
    },
    {
      "id":"B4",
      "text":"Do figures communicate results without distortion and with uncertainty/context?",
      "tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
      "hint":"Ensure figures include uncertainty disclosures."
    },
    {
      "id":"B5",
      "text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
      "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
      "hint":"Map evaluation practices to standards and note gaps."
    },
    {
      "id":"B6",
      "text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
      "tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
      "hint":"Describe triggers and re-evaluation procedures."
    }
  ],
  "benchmarkSourceFields": [
    {"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"},
    {"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"},
    {"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"},
    {"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"},
    {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
    {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"},
    {"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]},
    {"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"},
    {"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [90,94]"}
  ],
  "processSourceFields": [
    {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
    {"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"},
    {"name":"title","label":"Title","type":"text","placeholder":"Document title"},
    {"name":"author","label":"Author","type":"text","placeholder":"Author or owner"},
    {"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"},
    {"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"},
    {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"}
  ]
}