{
"version": "1.0",
"categories": [
{"id":"language-communication","name":"Language & Communication","type":"capability"},
{"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"},
{"id":"problem-solving","name":"Problem Solving","type":"capability"},
{"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"},
{"id":"learning-memory","name":"Learning & Memory","type":"capability"},
{"id":"perception-vision","name":"Perception & Vision","type":"capability"},
{"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"},
{"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"},
{"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"},
{"id":"harmful-content","name":"Harmful Content Generation","type":"risk"},
{"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"},
{"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"},
{"id":"bias-fairness","name":"Bias & Fairness","type":"risk"},
{"id":"security-robustness","name":"Security & Robustness","type":"risk"},
{"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"},
{"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"},
{"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"},
{"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"},
{"id":"governance-accountability","name":"Governance & Accountability","type":"risk"},
{"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"}
],
"benchmarkQuestions": [
{
"id":"A1",
"text":"Has the system been run on recognized, category-specific benchmarks?",
"tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
"hint":"List benchmarks, dataset versions and who executed them."
},
{
"id":"A2",
"text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
"tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
"hint":"Provide numeric scores and regulatory mappings."
},
{
"id":"A3",
"text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
"tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
"hint":"Provide comparative scores and targets."
},
{
"id":"A4",
"text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
"tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
"hint":"Describe stress tests and observed failure rates."
},
{
"id":"A5",
"text":"Is performance measured in the wild with automated monitors?",
"tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
"hint":"List live metrics and alerting rules."
},
{
"id":"A6",
"text":"Have you quantified train–test overlap or leakage risks that could inflate results?",
"tooltip":"Expect: Procedure (e.g., n-gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
"hint":"Report contamination checks and mitigation steps."
}
],
"processQuestions": [
{
"id":"B1",
"text":"Are the capability/risk claims and applicability for this category clearly documented?",
"tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
"hint":"Define scope and hypotheses."
},
{
"id":"B2",
"text":"Can others reproduce the results?",
"tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
"hint":"Point to code, data or proxies to reproduce results."
},
{
"id":"B3",
"text":"Have domain experts/affected users reviewed interpretations of results?",
"tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
"hint":"List reviewers and key feedback."
},
{
"id":"B4",
"text":"Do figures communicate results without distortion and with uncertainty/context?",
"tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
"hint":"Ensure figures include uncertainty disclosures."
},
{
"id":"B5",
"text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
"tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
"hint":"Map evaluation practices to standards and note gaps."
},
{
"id":"B6",
"text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
"tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
"hint":"Describe triggers and re-evaluation procedures."
}
],
"benchmarkSourceFields": [
{"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"},
{"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"},
{"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"},
{"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"},
{"name":"url","label":"URL","type":"text","placeholder":"https://..."},
{"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"},
{"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]},
{"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"},
{"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [90,94]"}
],
"processSourceFields": [
{"name":"url","label":"URL","type":"text","placeholder":"https://..."},
{"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"},
{"name":"title","label":"Title","type":"text","placeholder":"Document title"},
{"name":"author","label":"Author","type":"text","placeholder":"Author or owner"},
{"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"},
{"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"},
{"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"}
]
}