{
"version": "1.0",
"categories": [
{"id":"language-communication","name":"Language & Communication","type":"capability"},
{"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"},
{"id":"problem-solving","name":"Problem Solving","type":"capability"},
{"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"},
{"id":"learning-memory","name":"Learning & Memory","type":"capability"},
{"id":"perception-vision","name":"Perception & Vision","type":"capability"},
{"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"},
{"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"},
{"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"},
{"id":"harmful-content","name":"Harmful Content Generation","type":"risk"},
{"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"},
{"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"},
{"id":"bias-fairness","name":"Bias & Fairness","type":"risk"},
{"id":"security-robustness","name":"Security & Robustness","type":"risk"},
{"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"},
{"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"},
{"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"},
{"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"},
{"id":"governance-accountability","name":"Governance & Accountability","type":"risk"},
{"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"}
],
"benchmarkQuestions": [
{
"id":"A1",
"text":"Has the system been run on recognized, category-specific benchmarks?",
"tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
"hint":"List benchmarks, dataset versions and who executed them."
},
{
"id":"A2",
"text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
"tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
"hint":"Provide numeric scores and regulatory mappings."
},
{
"id":"A3",
"text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
"tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
"hint":"Provide comparative scores and targets."
},
{
"id":"A4",
"text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
"tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
"hint":"Describe stress tests and observed failure rates."
},
{
"id":"A5",
"text":"Is performance measured in the wild with automated monitors?",
"tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
"hint":"List live metrics and alerting rules."
},
{
"id":"A6",
"text":"Have you quantified train–test overlap or leakage risks that could inflate results?",
"tooltip":"Expect: Procedure (e.g., n-gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
"hint":"Report contamination checks and mitigation steps."
}
],
"processQuestions": [
{
"id":"B1",
"text":"Are the capability/risk claims and applicability for this category clearly documented?",
"tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
"hint":"Define scope and hypotheses."
},
{
"id":"B2",
"text":"Can others reproduce the results?",
"tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
"hint":"Point to code, data or proxies to reproduce results."
},
{
"id":"B3",
"text":"Have domain experts/affected users reviewed interpretations of results?",
"tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
"hint":"List reviewers and key feedback."
},
{
"id":"B4",
"text":"Do figures communicate results without distortion and with uncertainty/context?",
"tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
"hint":"Ensure figures include uncertainty disclosures."
},
{
"id":"B5",
"text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
"tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
"hint":"Map evaluation practices to standards and note gaps."
},
{
"id":"B6",
"text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
"tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
"hint":"Describe triggers and re-evaluation procedures."
}
],
"benchmarkSourceFields": [
{"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"},
{"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"},
{"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"},
{"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"},
{"name":"url","label":"URL","type":"text","placeholder":"https://..."},
{"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"},
{"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]},
{"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"},
{"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [90,94]"}
],
"processSourceFields": [
{"name":"url","label":"URL","type":"text","placeholder":"https://..."},
{"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"},
{"name":"title","label":"Title","type":"text","placeholder":"Document title"},
{"name":"author","label":"Author","type":"text","placeholder":"Author or owner"},
{"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"},
{"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"},
{"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"}
]
}