Spaces:
Running
Running
Update pages/5 Burst Detection.py
Browse files
pages/5 Burst Detection.py
CHANGED
|
@@ -135,7 +135,10 @@ def clean_data(df):
|
|
| 135 |
df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
|
| 136 |
|
| 137 |
# Vectorize processed text
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
| 139 |
X = vectorizer.fit_transform(df['processed'].tolist())
|
| 140 |
|
| 141 |
# Create DataFrame from the Document-Term Matrix (DTM)
|
|
@@ -350,14 +353,16 @@ uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
|
| 350 |
|
| 351 |
if uploaded_file is not None:
|
| 352 |
try:
|
| 353 |
-
c1, c2, c3 = st.columns([
|
| 354 |
top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
|
| 355 |
viz_selected = c2.selectbox("Option for visualization",
|
| 356 |
("Line graph", "Scatter plot"), on_change=reset_all)
|
| 357 |
-
running_total = c3.selectbox("
|
| 358 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
|
|
|
|
|
|
| 359 |
|
| 360 |
-
d1, d2 = st.columns([
|
| 361 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
| 362 |
col_name = d1.selectbox("Select column to analyze",
|
| 363 |
(coldf), on_change=reset_all)
|
|
|
|
| 135 |
df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
|
| 136 |
|
| 137 |
# Vectorize processed text
|
| 138 |
+
if count_method == "Document Frequency":
|
| 139 |
+
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True)
|
| 140 |
+
else:
|
| 141 |
+
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
|
| 142 |
X = vectorizer.fit_transform(df['processed'].tolist())
|
| 143 |
|
| 144 |
# Create DataFrame from the Document-Term Matrix (DTM)
|
|
|
|
| 353 |
|
| 354 |
if uploaded_file is not None:
|
| 355 |
try:
|
| 356 |
+
c1, c2, c3, c4 = st.columns([2,2,3,3])
|
| 357 |
top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
|
| 358 |
viz_selected = c2.selectbox("Option for visualization",
|
| 359 |
("Line graph", "Scatter plot"), on_change=reset_all)
|
| 360 |
+
running_total = c3.selectbox("Calculation method",
|
| 361 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
| 362 |
+
count_method = c4.selectbox("Count by",
|
| 363 |
+
("Term Frequency", "Document Frequency"), on_change=reset_all)
|
| 364 |
|
| 365 |
+
d1, d2 = st.columns([2,8])
|
| 366 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
| 367 |
col_name = d1.selectbox("Select column to analyze",
|
| 368 |
(coldf), on_change=reset_all)
|