Spaces:
Running
Running
ping98k
Refactor balanced K-Means implementation; update beta parameter default value, handle empty embeddings, and ensure proper centroid updates. Enhance UI to include beta input and adjust event handler to read beta value for clustering.
65b6e6f
| import { UMAP } from "https://cdn.jsdelivr.net/npm/[email protected]/+esm"; | |
| function balancedKMeans(emb, k, beta = 0.01, maxIter = 100) { | |
| if (!emb.length) return { labels: [], centroids: [] }; | |
| const n = emb.length, d = emb[0].length; | |
| k = Math.max(2, Math.min(k, n)); // guard k ≤ n | |
| let cent = kmeansPlusPlusInit(emb, k); | |
| const lab = new Uint32Array(n).fill(k); // start “unassigned” | |
| const cnt = new Uint32Array(k); | |
| for (let iter = 0; iter < maxIter; ++iter) { | |
| let moved = false; | |
| cnt.fill(0); | |
| // ── assignment with size penalty ── | |
| for (let i = 0; i < n; ++i) { | |
| let best = 0, bestCost = Infinity; | |
| for (let c = 0; c < k; ++c) { | |
| let dist = 0; | |
| for (let j = 0; j < d; ++j) { | |
| const diff = emb[i][j] - cent[c][j]; | |
| dist += diff * diff; | |
| } | |
| const sizePenalty = beta * (2 * cnt[c] + 1); | |
| const cost = dist + sizePenalty; | |
| if (cost < bestCost) { bestCost = cost; best = c; } | |
| } | |
| if (lab[i] !== best) { lab[i] = best; moved = true; } | |
| cnt[best]++; | |
| } | |
| // ── update centroids ── | |
| cent = Array.from({ length: k }, () => new Array(d).fill(0)); | |
| for (let i = 0; i < n; ++i) | |
| for (let j = 0; j < d; ++j) cent[lab[i]][j] += emb[i][j]; | |
| for (let c = 0; c < k; ++c) | |
| if (cnt[c]) { | |
| const inv = 1 / cnt[c]; | |
| for (let j = 0; j < d; ++j) cent[c][j] *= inv; | |
| } | |
| if (!moved) break; // converged | |
| } | |
| return { labels: Array.from(lab), centroids: cent }; | |
| } | |
| function kmeansPlusPlusInit(embeddings, k) { | |
| const n = embeddings.length; | |
| const dim = embeddings[0].length; | |
| const centroids = [embeddings[Math.floor(Math.random() * n)].slice()]; | |
| const d2 = new Float64Array(n); | |
| for (let c = 1; c < k; ++c) { | |
| let total = 0; | |
| for (let i = 0; i < n; ++i) { | |
| let dist = 0; | |
| for (let d = 0; d < dim; ++d) { | |
| const diff = embeddings[i][d] - centroids[c - 1][d]; | |
| dist += diff * diff; | |
| } | |
| if (c === 1 || dist < d2[i]) d2[i] = dist; | |
| total += d2[i]; | |
| } | |
| let r = Math.random() * total; | |
| let idx = 0; | |
| while (r > d2[idx]) r -= d2[idx++]; | |
| centroids.push(embeddings[idx].slice()); | |
| } | |
| return centroids; | |
| } | |
| export function kmeans(embeddings, k, maxIter = 100) { | |
| const n = embeddings.length; | |
| if (n === 0) return { labels: [], centroids: [] }; | |
| k = Math.max(2, Math.min(k, n)); | |
| const dim = embeddings[0].length; | |
| let centroids = kmeansPlusPlusInit(embeddings, k); | |
| const labels = new Uint32Array(n); | |
| const reseed = () => { | |
| let farIdx = 0; | |
| let farDist = -1; | |
| for (let i = 0; i < n; ++i) { | |
| let min = Infinity; | |
| for (let c = 0; c < k; ++c) { | |
| let dist = 0; | |
| for (let d = 0; d < dim; ++d) { | |
| const diff = embeddings[i][d] - centroids[c][d]; | |
| dist += diff * diff; | |
| } | |
| if (dist < min) min = dist; | |
| } | |
| if (min > farDist) { | |
| farDist = min; | |
| farIdx = i; | |
| } | |
| } | |
| return embeddings[farIdx].slice(); | |
| }; | |
| for (let iter = 0; iter < maxIter; ++iter) { | |
| let moved = false; | |
| for (let i = 0; i < n; ++i) { | |
| let best = 0; | |
| let bestDist = Infinity; | |
| for (let c = 0; c < k; ++c) { | |
| let dist = 0; | |
| for (let d = 0; d < dim; ++d) { | |
| const diff = embeddings[i][d] - centroids[c][d]; | |
| dist += diff * diff; | |
| } | |
| if (dist < bestDist) { | |
| bestDist = dist; | |
| best = c; | |
| } | |
| } | |
| if (labels[i] !== best) { | |
| labels[i] = best; | |
| moved = true; | |
| } | |
| } | |
| const counts = new Uint32Array(k); | |
| centroids = Array.from({ length: k }, () => new Array(dim).fill(0)); | |
| for (let i = 0; i < n; ++i) { | |
| counts[labels[i]]++; | |
| for (let d = 0; d < dim; ++d) | |
| centroids[labels[i]][d] += embeddings[i][d]; | |
| } | |
| for (let c = 0; c < k; ++c) { | |
| if (counts[c] === 0) { | |
| centroids[c] = reseed(); | |
| } else { | |
| const inv = 1 / counts[c]; | |
| for (let d = 0; d < dim; ++d) centroids[c][d] *= inv; | |
| } | |
| } | |
| if (!moved) break; | |
| } | |
| return { labels: Array.from(labels), centroids }; | |
| } | |
| export function runUMAP(embeddings, nNeighbors = 15) { | |
| const umap = new UMAP({ | |
| nComponents: 2, | |
| nNeighbors: Math.max(1, Math.min(nNeighbors, embeddings.length - 1)), | |
| minDist: 0.1 | |
| }); | |
| return umap.fit(embeddings); | |
| } | |
| export { balancedKMeans }; | |