Update github_repo_analyzer.py

github_repo_analyzer.py  +36 -14  CHANGED
@@ -2,22 +2,25 @@ import os
 import sys
 import tempfile
 import shutil
-from urllib.parse import urlparse, quote
 import requests
-from github import Github
-from git import Repo
-from collections import defaultdict
 import time
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import cosine_similarity
 import subprocess
 import json
-from pathlib import Path
 import traceback
 import argparse
 import re
+import warnings
+import numpy as np
+
+from collections import defaultdict
+from pathlib import Path
+from urllib.parse import urlparse, quote
+from github import Github
+from git import Repo
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.exceptions import ConvergenceWarning
 
 def run_semgrep(repo_path):
     try:

@@ -211,6 +214,12 @@ def parse_llm_response(response):
     return []
 
 def cluster_and_filter_items(items, n_clusters=5, n_items=10):
+    if len(items) == 0:
+        return []
+
+    if len(items) <= n_items:
+        return items
+
     # Combine title and body for text analysis
     texts = [f"{item['title']} {item['body']}" for item in items]
 

@@ -218,27 +227,40 @@ def cluster_and_filter_items(items, n_clusters=5, n_items=10):
     vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
     tfidf_matrix = vectorizer.fit_transform(texts)
 
+    # Determine the number of clusters
+    n_clusters = min(n_clusters, len(items))
+
     # Perform clustering
-    kmeans = KMeans(n_clusters=n_clusters)
-    kmeans.fit(tfidf_matrix)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        kmeans = KMeans(n_clusters=n_clusters)
+        kmeans.fit(tfidf_matrix)
 
     # Get cluster centers
     cluster_centers = kmeans.cluster_centers_
 
     # Find items closest to cluster centers
     filtered_items = []
-    for i in range(
+    for i in range(n_clusters):
         cluster_items = [item for item, label in zip(items, kmeans.labels_) if label == i]
         cluster_vectors = tfidf_matrix[kmeans.labels_ == i]
 
+        if cluster_vectors.shape[0] == 0:
+            continue
+
         # Calculate similarities to cluster center
         similarities = cosine_similarity(cluster_vectors, cluster_centers[i].reshape(1, -1)).flatten()
 
         # Sort items by similarity and select top ones
         sorted_items = [x for _, x in sorted(zip(similarities, cluster_items), key=lambda pair: pair[0], reverse=True)]
-        filtered_items.extend(sorted_items[:
+        filtered_items.extend(sorted_items[:max(1, n_items // n_clusters)])
+
+    # If we didn't get enough items, add more from the original list
+    if len(filtered_items) < n_items:
+        remaining_items = [item for item in items if item not in filtered_items]
+        filtered_items.extend(remaining_items[:n_items - len(filtered_items)])
 
-    return filtered_items
+    return filtered_items[:n_items]
 
 def safe_filter_open_items(open_items, closed_patterns, n_items=10):
     try:
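For a quick sanity check of the new behavior, here is a minimal usage sketch. It is not part of the commit: the toy issue dicts below are made-up stand-ins for the GitHub issues/PRs the analyzer normally fetches, and it assumes github_repo_analyzer.py can be imported without side effects.

from github_repo_analyzer import cluster_and_filter_items

# Hypothetical toy data shaped like the items the function expects
# (dicts with 'title' and 'body' keys).
small_batch = [
    {"title": "Fix typo in README", "body": "Small docs fix"},
    {"title": "Bump requests version", "body": "Dependency update"},
]
large_batch = [
    {"title": f"Crash when parsing file {i}", "body": "Traceback attached"}
    for i in range(30)
] + [
    {"title": f"Feature request {i}", "body": "Please add dark mode"}
    for i in range(30)
]

# At or below n_items, the new guard clause returns the list unchanged.
print(len(cluster_and_filter_items(small_batch, n_items=10)))  # 2

# Above n_items, items are clustered with TF-IDF + KMeans and at most
# n_items representatives (those closest to each cluster center) are kept.
print(len(cluster_and_filter_items(large_batch, n_items=10)))  # 10

The n_clusters = min(n_clusters, len(items)) clamp matters because scikit-learn's KMeans raises a ValueError when asked for more clusters than samples, and the ConvergenceWarning filter silences the warning KMeans emits when near-duplicate texts collapse into fewer distinct clusters than requested.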