{
  "version": "1.0",
  "categories": [
    {"id":"language-communication","name":"Language & Communication","type":"capability"},
    {"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"},
    {"id":"problem-solving","name":"Problem Solving","type":"capability"},
    {"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"},
    {"id":"learning-memory","name":"Learning & Memory","type":"capability"},
    {"id":"perception-vision","name":"Perception & Vision","type":"capability"},
    {"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"},
    {"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"},
    {"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"},
    {"id":"harmful-content","name":"Harmful Content Generation","type":"risk"},
    {"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"},
    {"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"},
    {"id":"bias-fairness","name":"Bias & Fairness","type":"risk"},
    {"id":"security-robustness","name":"Security & Robustness","type":"risk"},
    {"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"},
    {"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"},
    {"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"},
    {"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"},
    {"id":"governance-accountability","name":"Governance & Accountability","type":"risk"},
    {"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"}
  ],
  "benchmarkQuestions": [
    {
      "id":"A1",
      "text":"Has the system been run on recognized, category-specific benchmarks?",
      "tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
      "hint":"List benchmarks, dataset versions and who executed them."
    },
    {
      "id":"A2",
      "text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
      "tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
      "hint":"Provide numeric scores and regulatory mappings."
    },
    {
      "id":"A3",
      "text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
      "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
      "hint":"Provide comparative scores and targets."
    },
    {
      "id":"A4",
      "text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
      "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
      "hint":"Describe stress tests and observed failure rates."
    },
    {
      "id":"A5",
      "text":"Is performance measured in the wild with automated monitors?",
      "tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
      "hint":"List live metrics and alerting rules."
    },
    {
      "id":"A6",
      "text":"Have you quantified train–test overlap or leakage risks that could inflate results?",
      "tooltip":"Expect: Procedure (e.g., n-gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
      "hint":"Report contamination checks and mitigation steps."
    }
  ],
  "processQuestions": [
    {
      "id":"B1",
      "text":"Are the capability/risk claims and applicability for this category clearly documented?",
      "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
      "hint":"Define scope and hypotheses."
    },
    {
      "id":"B2",
      "text":"Can others reproduce the results?",
      "tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
      "hint":"Point to code, data or proxies to reproduce results."
    },
    {
      "id":"B3",
      "text":"Have domain experts/affected users reviewed interpretations of results?",
      "tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
      "hint":"List reviewers and key feedback."
    },
    {
      "id":"B4",
      "text":"Do figures communicate results without distortion and with uncertainty/context?",
      "tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
      "hint":"Ensure figures include uncertainty disclosures."
    },
    {
      "id":"B5",
      "text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
      "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
      "hint":"Map evaluation practices to standards and note gaps."
    },
    {
      "id":"B6",
      "text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
      "tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
      "hint":"Describe triggers and re-evaluation procedures."
    }
  ],
  "benchmarkSourceFields": [
    {"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"},
    {"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"},
    {"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"},
    {"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"},
    {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
    {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"},
    {"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]},
    {"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"},
    {"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [90,94]"}
  ],
  "processSourceFields": [
    {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
    {"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"},
    {"name":"title","label":"Title","type":"text","placeholder":"Document title"},
    {"name":"author","label":"Author","type":"text","placeholder":"Author or owner"},
    {"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"},
    {"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"},
    {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"}
  ]
}