Spaces:
Running
Running
ping98k
committed on
Commit
·
935873d
1
Parent(s):
aaffa94
Enhance cluster naming and K-Means functionality; implement random selection for cluster names, improve centroid initialization, and adjust UMAP parameters for better projection accuracy.
Browse files
- cluster_naming.js +14 -1
- clustering.js +58 -22
- main.js +1 -1
cluster_naming.js
CHANGED
|
@@ -5,7 +5,20 @@ const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B
|
|
| 5 |
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
| 6 |
|
| 7 |
export async function nameCluster(lines) {
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
const messages = [
|
| 10 |
{ role: "system", content: prompt_cluster },
|
| 11 |
{ role: "user", content: `Input:\n${joined}\nOutput:` }
|
|
|
|
| 5 |
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
| 6 |
|
| 7 |
export async function nameCluster(lines) {
|
| 8 |
+
// If more than 5 lines, randomly pick 5
|
| 9 |
+
let selected = lines;
|
| 10 |
+
if (lines.length > 5) {
|
| 11 |
+
selected = [];
|
| 12 |
+
const used = new Set();
|
| 13 |
+
while (selected.length < 5) {
|
| 14 |
+
const idx = Math.floor(Math.random() * lines.length);
|
| 15 |
+
if (!used.has(idx)) {
|
| 16 |
+
used.add(idx);
|
| 17 |
+
selected.push(lines[idx]);
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
| 21 |
+
const joined = selected.join("\n");
|
| 22 |
const messages = [
|
| 23 |
{ role: "system", content: prompt_cluster },
|
| 24 |
{ role: "user", content: `Input:\n${joined}\nOutput:` }
|
clustering.js
CHANGED
|
@@ -1,37 +1,69 @@
|
|
| 1 |
import { UMAP } from "https://cdn.jsdelivr.net/npm/[email protected]/+esm";
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
export function kmeans(embeddings, k, maxIter = 100) {
|
| 4 |
const n = embeddings.length;
|
|
|
|
|
|
|
| 5 |
const dim = embeddings[0].length;
|
| 6 |
-
let centroids =
|
| 7 |
-
|
| 8 |
|
| 9 |
const reseed = () => {
|
| 10 |
-
let
|
|
|
|
| 11 |
for (let i = 0; i < n; ++i) {
|
| 12 |
-
let
|
| 13 |
for (let c = 0; c < k; ++c) {
|
| 14 |
let dist = 0;
|
| 15 |
-
for (let d = 0; d < dim; ++d)
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
}
|
| 19 |
-
if (
|
| 20 |
-
|
| 21 |
-
|
| 22 |
}
|
| 23 |
}
|
| 24 |
-
return embeddings[
|
| 25 |
};
|
| 26 |
|
| 27 |
for (let iter = 0; iter < maxIter; ++iter) {
|
| 28 |
-
let
|
| 29 |
for (let i = 0; i < n; ++i) {
|
| 30 |
-
let best = 0
|
|
|
|
| 31 |
for (let c = 0; c < k; ++c) {
|
| 32 |
let dist = 0;
|
| 33 |
-
for (let d = 0; d < dim; ++d)
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
if (dist < bestDist) {
|
| 36 |
bestDist = dist;
|
| 37 |
best = c;
|
|
@@ -39,11 +71,11 @@ export function kmeans(embeddings, k, maxIter = 100) {
|
|
| 39 |
}
|
| 40 |
if (labels[i] !== best) {
|
| 41 |
labels[i] = best;
|
| 42 |
-
|
| 43 |
}
|
| 44 |
}
|
|
|
|
| 45 |
centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
|
| 46 |
-
const counts = new Array(k).fill(0);
|
| 47 |
for (let i = 0; i < n; ++i) {
|
| 48 |
counts[labels[i]]++;
|
| 49 |
for (let d = 0; d < dim; ++d)
|
|
@@ -53,16 +85,20 @@ export function kmeans(embeddings, k, maxIter = 100) {
|
|
| 53 |
if (counts[c] === 0) {
|
| 54 |
centroids[c] = reseed();
|
| 55 |
} else {
|
| 56 |
-
|
| 57 |
-
|
| 58 |
}
|
| 59 |
}
|
| 60 |
-
if (!
|
| 61 |
}
|
| 62 |
-
return { labels, centroids };
|
| 63 |
}
|
| 64 |
|
| 65 |
export function runUMAP(embeddings, nNeighbors = 15) {
|
| 66 |
-
const umap = new UMAP({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
return umap.fit(embeddings);
|
| 68 |
}
|
|
|
|
| 1 |
import { UMAP } from "https://cdn.jsdelivr.net/npm/[email protected]/+esm";
|
| 2 |
|
| 3 |
+
/**
 * Chooses `k` starting centroids for K-Means using k-means++ seeding.
 *
 * The first centroid is a uniformly random input point. Each later centroid
 * is drawn with probability proportional to a point's squared Euclidean
 * distance to its nearest already-chosen centroid.
 *
 * @param {number[][]} embeddings - Input vectors; each is read up to the
 *   length of the first vector.
 * @param {number} k - Number of centroids requested. For non-empty input at
 *   least one centroid is returned, even when `k` is below 1.
 * @returns {number[][]} Copies of the chosen input vectors, or an empty array
 *   when `embeddings` is empty.
 */
function kmeansPlusPlusInit(embeddings, k) {
  const n = embeddings.length;
  if (n === 0) return [];
  const dim = embeddings[0].length;
  const centroids = [embeddings[Math.floor(Math.random() * n)].slice()];
  // d2[i] is the squared distance from point i to its nearest chosen centroid.
  const d2 = new Float64Array(n);

  for (let c = 1; c < k; ++c) {
    // Only the most recently added centroid can lower a point's nearest
    // distance, so one comparison per point keeps d2 up to date.
    const latest = centroids[c - 1];
    let total = 0;
    for (let i = 0; i < n; ++i) {
      let dist = 0;
      for (let d = 0; d < dim; ++d) {
        const diff = embeddings[i][d] - latest[d];
        dist += diff * diff;
      }
      if (c === 1 || dist < d2[i]) d2[i] = dist;
      total += d2[i];
    }

    // Roulette-wheel selection over d2. Zero-weight points (those that
    // coincide with an existing centroid) are skipped so they are never
    // picked while a positive-weight point exists. `idx` always tracks the
    // last positive-weight point seen, so rounding drift in the running
    // subtraction cannot walk past the end of the array. When every weight
    // is zero, idx stays 0.
    let r = Math.random() * total;
    let idx = 0;
    for (let i = 0; i < n; ++i) {
      if (d2[i] <= 0) continue;
      idx = i;
      if (r < d2[i]) break;
      r -= d2[i];
    }
    centroids.push(embeddings[idx].slice());
  }
  return centroids;
}
|
| 26 |
+
|
| 27 |
export function kmeans(embeddings, k, maxIter = 100) {
|
| 28 |
const n = embeddings.length;
|
| 29 |
+
if (n === 0) return { labels: [], centroids: [] };
|
| 30 |
+
k = Math.max(2, Math.min(k, n));
|
| 31 |
const dim = embeddings[0].length;
|
| 32 |
+
let centroids = kmeansPlusPlusInit(embeddings, k);
|
| 33 |
+
const labels = new Uint32Array(n);
|
| 34 |
|
| 35 |
const reseed = () => {
|
| 36 |
+
let farIdx = 0;
|
| 37 |
+
let farDist = -1;
|
| 38 |
for (let i = 0; i < n; ++i) {
|
| 39 |
+
let min = Infinity;
|
| 40 |
for (let c = 0; c < k; ++c) {
|
| 41 |
let dist = 0;
|
| 42 |
+
for (let d = 0; d < dim; ++d) {
|
| 43 |
+
const diff = embeddings[i][d] - centroids[c][d];
|
| 44 |
+
dist += diff * diff;
|
| 45 |
+
}
|
| 46 |
+
if (dist < min) min = dist;
|
| 47 |
}
|
| 48 |
+
if (min > farDist) {
|
| 49 |
+
farDist = min;
|
| 50 |
+
farIdx = i;
|
| 51 |
}
|
| 52 |
}
|
| 53 |
+
return embeddings[farIdx].slice();
|
| 54 |
};
|
| 55 |
|
| 56 |
for (let iter = 0; iter < maxIter; ++iter) {
|
| 57 |
+
let moved = false;
|
| 58 |
for (let i = 0; i < n; ++i) {
|
| 59 |
+
let best = 0;
|
| 60 |
+
let bestDist = Infinity;
|
| 61 |
for (let c = 0; c < k; ++c) {
|
| 62 |
let dist = 0;
|
| 63 |
+
for (let d = 0; d < dim; ++d) {
|
| 64 |
+
const diff = embeddings[i][d] - centroids[c][d];
|
| 65 |
+
dist += diff * diff;
|
| 66 |
+
}
|
| 67 |
if (dist < bestDist) {
|
| 68 |
bestDist = dist;
|
| 69 |
best = c;
|
|
|
|
| 71 |
}
|
| 72 |
if (labels[i] !== best) {
|
| 73 |
labels[i] = best;
|
| 74 |
+
moved = true;
|
| 75 |
}
|
| 76 |
}
|
| 77 |
+
const counts = new Uint32Array(k);
|
| 78 |
centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
|
|
|
|
| 79 |
for (let i = 0; i < n; ++i) {
|
| 80 |
counts[labels[i]]++;
|
| 81 |
for (let d = 0; d < dim; ++d)
|
|
|
|
| 85 |
if (counts[c] === 0) {
|
| 86 |
centroids[c] = reseed();
|
| 87 |
} else {
|
| 88 |
+
const inv = 1 / counts[c];
|
| 89 |
+
for (let d = 0; d < dim; ++d) centroids[c][d] *= inv;
|
| 90 |
}
|
| 91 |
}
|
| 92 |
+
if (!moved) break;
|
| 93 |
}
|
| 94 |
+
return { labels: Array.from(labels), centroids };
|
| 95 |
}
|
| 96 |
|
| 97 |
/**
 * Fits a 2-component UMAP on the given embeddings and returns the result of
 * `fit`.
 *
 * @param {number[][]} embeddings - Vectors to project.
 * @param {number} [nNeighbors=15] - Requested neighbourhood size; it is capped
 *   at `embeddings.length - 1` and never allowed below 1.
 * @returns {*} Whatever `UMAP#fit` returns for `embeddings`.
 */
export function runUMAP(embeddings, nNeighbors = 15) {
  // A point cannot have more neighbours than there are other points.
  const cappedNeighbors = Math.min(nNeighbors, embeddings.length - 1);
  const options = {
    nComponents: 2,
    nNeighbors: Math.max(1, cappedNeighbors),
    minDist: 0.1
  };
  return new UMAP(options).fit(embeddings);
}
|
main.js
CHANGED
|
@@ -62,7 +62,7 @@ document.getElementById("kmeans-btn").onclick = async () => {
|
|
| 62 |
// UMAP projection
|
| 63 |
const { UMAP } = await import('https://cdn.jsdelivr.net/npm/[email protected]/+esm');
|
| 64 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
| 65 |
-
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.
|
| 66 |
const proj = umap.fit(embeddings);
|
| 67 |
// Group lines by cluster
|
| 68 |
const clustered = Array.from({ length: k }, () => []);
|
|
|
|
| 62 |
// UMAP projection
|
| 63 |
const { UMAP } = await import('https://cdn.jsdelivr.net/npm/[email protected]/+esm');
|
| 64 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
| 65 |
+
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.2, metric: "cosine" });
|
| 66 |
const proj = umap.fit(embeddings);
|
| 67 |
// Group lines by cluster
|
| 68 |
const clustered = Array.from({ length: k }, () => []);
|