Spaces:
Runtime error
Adding files
Browse files
- .DS_Store +0 -0
- app.py +196 -0
- lstm/lstm_model.pth +3 -0
- lstm/rnn_preprocessing.py +80 -0
- lstm/vocab_to_int.json +0 -0
- lstm/word2vec.model +3 -0
- requirements.txt +73 -0
- tf-idf/tf-idf.pkl +3 -0
- tf-idf/tf-idf_vectorizer.pkl +3 -0
.DS_Store
ADDED
Binary file (6.15 kB)
app.py
ADDED
@@ -0,0 +1,196 @@
import streamlit as st
import pickle
import time
import json
import re
import string
import numpy as np
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple


st.title('DS week 10: review classification, toxicity detection and text generation')

st.sidebar.header('Choose a page')
page = st.sidebar.radio("Choose a page", ["Overview", "Review classification", "Toxicity detection", "Text generation"])

if page == "Overview":

    st.subheader('*Task 1*: classifying reviews of medical institutions')
    st.write("In short: classify a review with three models, report each model's prediction time, and show a table comparing the models by macro F1")

    st.subheader('*Task 2*: toxicity detection')
    st.write('In short: estimate how toxic a user message is')

    st.subheader('*Task 3*: text generation')
    st.write('In short: generate text with a GPT model from a user prompt')

    st.subheader('☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️☀️')

    st.subheader('Made by team "BERT": Алексей А., Светлана, Алиса')


if page == "Review classification":
    # Load the TF-IDF classifier and vectorizer
    with open('tf-idf/tf-idf.pkl', 'rb') as f:
        model_tf = pickle.load(f)

    with open('tf-idf/tf-idf_vectorizer.pkl', 'rb') as f:
        vectorizer_tf = pickle.load(f)

    # Load the vocab_to_int dictionary and the Word2Vec model
    with open('lstm/vocab_to_int.json', 'r') as f:
        vocab_to_int = json.load(f)

    word2vec_model = gensim.models.Word2Vec.load("lstm/word2vec.model")

    # Russian stopwords
    stop_words = ['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между']

    def data_preprocessing(text: str) -> str:
        text = text.lower()
        text = re.sub('<.*?>', '', text)  # remove html tags
        text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
        text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
        text = ' '.join([word for word in text.split() if not word.isdigit()])  # remove digits
        return text

    # Predict the review class and measure prediction time with the TF-IDF model
    def classify_review_tf(review):
        # Preprocess and vectorize the review
        review_vector = vectorizer_tf.transform([data_preprocessing(review)])
        # Predict
        start_time = time.time()
        prediction = model_tf.predict(review_vector)
        end_time = time.time()
        # Prediction time
        prediction_time = end_time - start_time
        return prediction[0], prediction_time

    VOCAB_SIZE = len(vocab_to_int) + 1  # add 1 for the padding token
    EMBEDDING_DIM = 32
    HIDDEN_SIZE = 32
    SEQ_LEN = 100

    class BahdanauAttention(nn.Module):
        def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
            super().__init__()

            self.W_q = nn.Linear(hidden_size, hidden_size)
            self.W_k = nn.Linear(hidden_size, hidden_size)
            self.V = nn.Linear(hidden_size, 1)

        def forward(
            self,
            keys: torch.Tensor,
            query: torch.Tensor
        ) -> Tuple[torch.Tensor, torch.Tensor]:
            query = self.W_q(query)
            keys = self.W_k(keys)

            energy = self.V(torch.tanh(query.unsqueeze(1) + keys)).squeeze(-1)
            weights = F.softmax(energy, -1)
            context = torch.bmm(weights.unsqueeze(1), keys)
            return context, weights

    # Placeholder embedding matrix; the trained weights are restored from the checkpoint below
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))

    class LSTMConcatAttention(nn.Module):
        def __init__(self) -> None:
            super().__init__()

            self.embedding = embedding_layer
            self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
            self.attn = BahdanauAttention(HIDDEN_SIZE)
            self.clf = nn.Sequential(
                nn.Linear(HIDDEN_SIZE, 128),
                nn.Dropout(),
                nn.Tanh(),
                nn.Linear(128, 1)
            )

        def forward(self, x):
            embeddings = self.embedding(x)
            outputs, (h_n, _) = self.lstm(embeddings)
            att_hidden, att_weights = self.attn(outputs, h_n.squeeze(0))
            out = self.clf(att_hidden)
            return out, att_weights

    model_lstm = LSTMConcatAttention()  # initialize with the same parameters that were used for training
    model_lstm.load_state_dict(torch.load("lstm/lstm_model.pth"))
    model_lstm.eval()

    # Add the <UNK> token if it is missing
    if '<UNK>' not in vocab_to_int:
        vocab_to_int['<UNK>'] = len(vocab_to_int)  # assign a new unique index

    # Add the <PAD> token if it is missing
    if '<PAD>' not in vocab_to_int:
        vocab_to_int['<PAD>'] = len(vocab_to_int)  # assign a new unique index

    def text_to_vector(text, max_length=SEQ_LEN):
        words = text.split()
        vector = [vocab_to_int.get(word, vocab_to_int["<UNK>"]) for word in words][:max_length]
        vector += [vocab_to_int["<PAD>"]] * (max_length - len(vector))  # pad the vector
        return np.array(vector, dtype=np.int64)  # make sure the dtype is int64

    # Predict the review class and measure prediction time with the LSTM model
    def classify_review_lstm(review):
        # Preprocess and vectorize the review
        review_vector = text_to_vector(data_preprocessing(review))
        # Convert to a PyTorch tensor and add a batch dimension
        review_tensor = torch.tensor(review_vector).unsqueeze(0)

        # Predict
        start_time = time.time()
        with torch.no_grad():
            logit, _ = model_lstm(review_tensor)
        end_time = time.time()
        prediction = int(torch.sigmoid(logit).round().item())  # turn the raw logit into a binary class

        # Prediction time
        prediction_time = end_time - start_time
        return prediction, prediction_time

    # Streamlit interface
    st.title('Clinic review classifier')

    # Text input for the review
    user_review = st.text_input('Enter your review of a clinic')

    if st.button('Classify'):
        if user_review:
            # Classify the review with both models
            prediction_tf, pred_time_tf = classify_review_tf(user_review)
            st.write(f'Predicted class, TF-IDF: {prediction_tf}')
            st.write(f'Prediction time, TF-IDF: {pred_time_tf:.4f} s')
            prediction_lstm, pred_time_lstm = classify_review_lstm(user_review)
            st.write(f'Predicted class, LSTM: {prediction_lstm}')
            st.write(f'Prediction time, LSTM: {pred_time_lstm:.4f} s')
        else:
            st.write('Please enter a review')


# if page == "Toxicity detection":


# if page == "Text generation":
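app.py only loads tf-idf.pkl and tf-idf_vectorizer.pkl; the code that produced them is not part of this commit. A minimal sketch of how such a pair could be trained and saved, assuming a scikit-learn LogisticRegression over TF-IDF features; the file reviews.csv and its 'text'/'label' columns are hypothetical:

# Sketch of an assumed training pipeline, not the team's actual code
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

df = pd.read_csv('reviews.csv')  # hypothetical training data with 'text' and 'label' columns
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)  # fit the vocabulary on the train split only
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)
print('F1 macro:', f1_score(y_test, clf.predict(X_test_vec), average='macro'))

# Save under the same paths that app.py reads from
with open('tf-idf/tf-idf.pkl', 'wb') as f:
    pickle.dump(clf, f)
with open('tf-idf/tf-idf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)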
lstm/lstm_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6fd3a156141324e2c9eb10ac3458a0e74ebb9c49aef162c5794a415f91de81f
size 11341922
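This checkpoint is a state_dict for the LSTMConcatAttention class defined in app.py. A quick way to sanity-check that the stored tensor shapes match the architecture (a sketch; the expected shapes assume EMBEDDING_DIM = HIDDEN_SIZE = 32 as in app.py):

# Sketch: inspect the checkpoint's tensor shapes
import torch

state = torch.load('lstm/lstm_model.pth', map_location='cpu')
for name, tensor in state.items():
    print(name, tuple(tensor.shape))

# Expected, given EMBEDDING_DIM = HIDDEN_SIZE = 32:
#   embedding.weight   (VOCAB_SIZE, 32)
#   lstm.weight_ih_l0  (128, 32)   # nn.LSTM packs 4 * HIDDEN_SIZE gate rows
#   attn.W_q.weight    (32, 32)
#   clf.0.weight       (128, 32)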
lstm/rnn_preprocessing.py
ADDED
@@ -0,0 +1,80 @@
import re
import string
import numpy as np
import torch

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def data_preprocessing(text: str) -> str:
    """Preprocess a string: lowercase, remove html tags, punctuation,
    stopwords and digits

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """

    text = text.lower()
    text = re.sub('<.*?>', '', text)  # remove html tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
    text = ' '.join([word for word in text.split() if not word.isdigit()])  # remove digits
    return text

def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep only (word, count) pairs that occur more than n times"""
    return list(filter(lambda x: x[1] > n, sorted_words))

def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Make left-sided padding for an input list of token sequences

    Args:
        review_int (list): input list of token sequences
        seq_len (int): max sequence length; if len(review_int[i]) > seq_len the sequence is trimmed, otherwise it is left-padded with zeros

    Returns:
        np.ndarray: padded sequences
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)

    return features

def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Run all preprocessing steps on a single string

    Args:
        input_string (str): input string for preprocessing
        seq_len (int): max sequence length; longer sequences are trimmed, shorter ones are left-padded with zeros
        vocab_to_int (dict): word corpus {'word': int index}
        verbose (bool, optional): print words that are missing from the vocabulary. Defaults to False.

    Returns:
        torch.Tensor: tensor of token indices of length seq_len
    """

    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
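A usage sketch for these helpers with a toy vocabulary (assumes the NLTK stopword corpus is downloaded and the repository root is on the import path):

# Sketch: running a single review through the preprocessing pipeline
import nltk
nltk.download('stopwords')  # one-time setup; data_preprocessing needs this corpus

from lstm.rnn_preprocessing import preprocess_single_string

toy_vocab = {'doctor': 1, 'friendly': 2, 'clinic': 3}  # toy vocab_to_int
tokens = preprocess_single_string(
    'The doctor at this clinic was very friendly!', seq_len=10,
    vocab_to_int=toy_vocab, verbose=True)
print(tokens)  # left-padded tensor: tensor([0, 0, 0, 0, 0, 0, 0, 1, 3, 2])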
lstm/vocab_to_int.json
ADDED
The diff for this file is too large to render.
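app.py treats this file as a flat word-to-index dictionary; a sketch of the assumed structure (the example keys are illustrative, not taken from the file):

import json

with open('lstm/vocab_to_int.json') as f:
    vocab_to_int = json.load(f)  # assumed shape: {"word": index, ...}

print(len(vocab_to_int))  # app.py derives VOCAB_SIZE from this length (+1 for padding)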
lstm/word2vec.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6026f577c772a215706fdc1afa07bdbf069b30dc9f65b658bc638c52d6d79611
size 1251312
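app.py loads this Word2Vec model but initializes embedding_matrix with zeros, relying on the LSTM checkpoint to restore the trained embedding weights. Presumably the matrix was originally filled from these vectors; a sketch, assuming the model's vector_size equals EMBEDDING_DIM (32):

# Sketch: building an embedding matrix from word2vec.model for words in vocab_to_int
import json
import numpy as np
import gensim

word2vec_model = gensim.models.Word2Vec.load('lstm/word2vec.model')
with open('lstm/vocab_to_int.json') as f:
    vocab_to_int = json.load(f)

EMBEDDING_DIM = word2vec_model.wv.vector_size  # expected to be 32
embedding_matrix = np.zeros((len(vocab_to_int) + 1, EMBEDDING_DIM))
for word, idx in vocab_to_int.items():
    if word in word2vec_model.wv:  # leave out-of-vocabulary rows as zeros
        embedding_matrix[idx] = word2vec_model.wv[word]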
requirements.txt
ADDED
@@ -0,0 +1,73 @@
altair==5.2.0
attrs==23.1.0
blinker==1.7.0
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
contourpy==1.2.0
cycler==0.12.1
filelock==3.13.1
fonttools==4.46.0
fsspec==2023.12.1
gensim==4.3.2
gitdb==4.0.11
GitPython==3.1.40
huggingface-hub==0.19.4
idna==3.6
importlib-metadata==6.11.0
importlib-resources==6.1.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.20.0
jsonschema-specifications==2023.11.2
kiwisolver==1.4.5
lightning-utilities==0.10.0
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.8.2
mdurl==0.1.2
mpmath==1.3.0
networkx==3.2.1
nltk==3.8.1
numpy==1.26.2
packaging==23.2
pandas==2.1.3
Pillow==10.1.0
protobuf==4.25.1
pyarrow==14.0.1
pydeck==0.8.1b0
Pygments==2.17.2
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
referencing==0.32.0
regex==2023.10.3
requests==2.31.0
rich==13.7.0
rpds-py==0.13.2
safetensors==0.4.1
scikit-learn==1.3.2
scipy==1.11.4
six==1.16.0
smart-open==6.4.0
smmap==5.0.1
streamlit==1.29.0
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.2.0
tokenizers==0.15.0
toml==0.10.2
toolz==0.12.0
torch==2.1.1
torchmetrics==1.2.1
tornado==6.4
tqdm==4.66.1
transformers==4.35.2
typing_extensions==4.8.0
tzdata==2023.3
tzlocal==5.2
urllib3==2.1.0
validators==0.22.0
zipp==3.17.0
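To reproduce the Space locally, install these pinned versions with pip install -r requirements.txt and start the app with streamlit run app.py.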
tf-idf/tf-idf.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:26ac8a2985942f283db82f07dbe2124b18e50d05c0f1e98ede95338a26911b42
size 529407
tf-idf/tf-idf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:772dfeb1b7264ebf743229968e7d272f5aef1eb24911672708a485dc0421f147
size 2667929