import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm"; function balancedKMeans(emb, k, beta = 0.01, maxIter = 100) { if (!emb.length) return { labels: [], centroids: [] }; const n = emb.length, d = emb[0].length; k = Math.max(2, Math.min(k, n)); // guard k ≤ n let cent = kmeansPlusPlusInit(emb, k); const lab = new Uint32Array(n).fill(k); // start “unassigned” const cnt = new Uint32Array(k); for (let iter = 0; iter < maxIter; ++iter) { let moved = false; cnt.fill(0); // ── assignment with size penalty ── for (let i = 0; i < n; ++i) { let best = 0, bestCost = Infinity; for (let c = 0; c < k; ++c) { let dist = 0; for (let j = 0; j < d; ++j) { const diff = emb[i][j] - cent[c][j]; dist += diff * diff; } const sizePenalty = beta * (2 * cnt[c] + 1); const cost = dist + sizePenalty; if (cost < bestCost) { bestCost = cost; best = c; } } if (lab[i] !== best) { lab[i] = best; moved = true; } cnt[best]++; } // ── update centroids ── cent = Array.from({ length: k }, () => new Array(d).fill(0)); for (let i = 0; i < n; ++i) for (let j = 0; j < d; ++j) cent[lab[i]][j] += emb[i][j]; for (let c = 0; c < k; ++c) if (cnt[c]) { const inv = 1 / cnt[c]; for (let j = 0; j < d; ++j) cent[c][j] *= inv; } if (!moved) break; // converged } return { labels: Array.from(lab), centroids: cent }; } function kmeansPlusPlusInit(embeddings, k) { const n = embeddings.length; const dim = embeddings[0].length; const centroids = [embeddings[Math.floor(Math.random() * n)].slice()]; const d2 = new Float64Array(n); for (let c = 1; c < k; ++c) { let total = 0; for (let i = 0; i < n; ++i) { let dist = 0; for (let d = 0; d < dim; ++d) { const diff = embeddings[i][d] - centroids[c - 1][d]; dist += diff * diff; } if (c === 1 || dist < d2[i]) d2[i] = dist; total += d2[i]; } let r = Math.random() * total; let idx = 0; while (r > d2[idx]) r -= d2[idx++]; centroids.push(embeddings[idx].slice()); } return centroids; } export function kmeans(embeddings, k, maxIter = 100) { const n = embeddings.length; if (n === 0) return { labels: [], centroids: [] }; k = Math.max(2, Math.min(k, n)); const dim = embeddings[0].length; let centroids = kmeansPlusPlusInit(embeddings, k); const labels = new Uint32Array(n); const reseed = () => { let farIdx = 0; let farDist = -1; for (let i = 0; i < n; ++i) { let min = Infinity; for (let c = 0; c < k; ++c) { let dist = 0; for (let d = 0; d < dim; ++d) { const diff = embeddings[i][d] - centroids[c][d]; dist += diff * diff; } if (dist < min) min = dist; } if (min > farDist) { farDist = min; farIdx = i; } } return embeddings[farIdx].slice(); }; for (let iter = 0; iter < maxIter; ++iter) { let moved = false; for (let i = 0; i < n; ++i) { let best = 0; let bestDist = Infinity; for (let c = 0; c < k; ++c) { let dist = 0; for (let d = 0; d < dim; ++d) { const diff = embeddings[i][d] - centroids[c][d]; dist += diff * diff; } if (dist < bestDist) { bestDist = dist; best = c; } } if (labels[i] !== best) { labels[i] = best; moved = true; } } const counts = new Uint32Array(k); centroids = Array.from({ length: k }, () => new Array(dim).fill(0)); for (let i = 0; i < n; ++i) { counts[labels[i]]++; for (let d = 0; d < dim; ++d) centroids[labels[i]][d] += embeddings[i][d]; } for (let c = 0; c < k; ++c) { if (counts[c] === 0) { centroids[c] = reseed(); } else { const inv = 1 / counts[c]; for (let d = 0; d < dim; ++d) centroids[c][d] *= inv; } } if (!moved) break; } return { labels: Array.from(labels), centroids }; } export function runUMAP(embeddings, nNeighbors = 15) { const umap = new UMAP({ nComponents: 2, nNeighbors: Math.max(1, Math.min(nNeighbors, embeddings.length - 1)), minDist: 0.1 }); return umap.fit(embeddings); } export { balancedKMeans };