{ "version": "1.0", "categories": [ {"id":"language-communication","name":"Language & Communication","type":"capability"}, {"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"}, {"id":"problem-solving","name":"Problem Solving","type":"capability"}, {"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"}, {"id":"learning-memory","name":"Learning & Memory","type":"capability"}, {"id":"perception-vision","name":"Perception & Vision","type":"capability"}, {"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"}, {"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"}, {"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"}, {"id":"harmful-content","name":"Harmful Content Generation","type":"risk"}, {"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"}, {"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"}, {"id":"bias-fairness","name":"Bias & Fairness","type":"risk"}, {"id":"security-robustness","name":"Security & Robustness","type":"risk"}, {"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"}, {"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"}, {"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"}, {"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"}, {"id":"governance-accountability","name":"Governance & Accountability","type":"risk"}, {"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"} ], "benchmarkQuestions": [ { "id":"A1", "text":"Has the system been run on recognized, category-specific benchmarks?", "tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external/cooperative).", "hint":"List benchmarks, dataset versions and who executed them." 
}, { "id":"A2", "text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?", "tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.", "hint":"Provide numeric scores and regulatory mappings." }, { "id":"A3", "text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?", "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.", "hint":"Provide comparative scores and targets." }, { "id":"A4", "text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?", "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.", "hint":"Describe stress tests and observed failure rates." }, { "id":"A5", "text":"Is performance measured in the wild with automated monitors?", "tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.", "hint":"List live metrics and alerting rules." }, { "id":"A6", "text":"Have you quantified train–test overlap or leakage risks that could inflate results?", "tooltip":"Expect: Procedure (e.g., n‑gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.", "hint":"Report contamination checks and mitigation steps." } ], "processQuestions": [ { "id":"B1", "text":"Are the capability/risk claims and applicability for this category clearly documented?", "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.", "hint":"Define scope and hypotheses." 
}, { "id":"B2", "text":"Can others reproduce the results?", "tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.", "hint":"Point to code, data or proxies to reproduce results." }, { "id":"B3", "text":"Have domain experts/affected users reviewed interpretations of results?", "tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.", "hint":"List reviewers and key feedback." }, { "id":"B4", "text":"Do figures communicate results without distortion and with uncertainty/context?", "tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.", "hint":"Ensure figures include uncertainty disclosures." }, { "id":"B5", "text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?", "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.", "hint":"Map evaluation practices to standards and note gaps." }, { "id":"B6", "text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?", "tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.", "hint":"Describe triggers and re-evaluation procedures." 
} ], "benchmarkSourceFields": [ {"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"}, {"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"}, {"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"}, {"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"}, {"name":"url","label":"URL","type":"text","placeholder":"https://..."}, {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"}, {"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]}, {"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"}, {"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [82,88]"} ], "processSourceFields": [ {"name":"url","label":"URL","type":"text","placeholder":"https://..."}, {"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"}, {"name":"title","label":"Title","type":"text","placeholder":"Document title"}, {"name":"author","label":"Author","type":"text","placeholder":"Author or owner"}, {"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"}, {"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"}, {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"} ] }