Spaces:

sebastiansarasti
/

AI_written_text_identification

Sleeping

App Files Files Community

sebastiansarasti committed on Jun 17

Commit

fa08326

0 Parent(s):

first commit

Browse files

Files changed (21) hide show

.github/workflows/hugggingface.yaml +17 -0
Dockerfile +33 -0
README.md +11 -0
requirements.txt +9 -0
src/app/__init__.py +0 -0
src/app/__pycache__/pipelines.cpython-311.pyc +0 -0
src/app/__pycache__/xai.cpython-311.pyc +0 -0
src/app/main.py +89 -0
src/app/pipelines.py +28 -0
src/app/requirements.txt +11 -0
src/app/test.ipynb +510 -0
src/app/utils/__init__.py +0 -0
src/app/utils/__pycache__/__init__.cpython-311.pyc +0 -0
src/app/utils/__pycache__/log_model.cpython-311.pyc +0 -0
src/app/utils/__pycache__/text_features.cpython-311.pyc +0 -0
src/app/utils/__pycache__/text_processing.cpython-311.pyc +0 -0
src/app/utils/download_model.py +23 -0
src/app/utils/log_model.py +56 -0
src/app/utils/text_features.py +70 -0
src/app/utils/text_processing.py +160 -0
src/app/xai.py +27 -0

.github/workflows/hugggingface.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+# Mirrors the main branch of this GitHub repository to a Hugging Face Space on every push.
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # fetch the complete history and the LFS objects so the push below is not a shallow one
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # $HF_TOKEN is expanded by the shell from the env block above, so the token is never written in the workflow file
+        run: git push --force https://${{ secrets.HF_USERNAME }}:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_USERNAME }}/${{ secrets.SPACE_NAME }} main

Dockerfile ADDED Viewed

	@@ -0,0 +1,33 @@

+FROM python:3.11-slim
+# Set working directory
+WORKDIR /app
+# Install curl: the HEALTHCHECK below calls it and the slim base image does not ship it
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
+# Copy files (lands in /app/src/app because of the WORKDIR above)
+COPY src/app ./src/app
+# Install uv and Python packages
+RUN pip install uv
+# The path is relative to WORKDIR; /src/app/requirements.txt does not exist in the image
+# NOTE(review): src/app/requirements.txt does not list streamlit - confirm it gets installed,
+# otherwise the ENTRYPOINT below cannot start
+RUN uv pip install --system -r src/app/requirements.txt
+# Create non-root user and give permissions
+RUN useradd -m appuser && \
+    mkdir -p /app/cache /app/.streamlit && \
+    chown -R appuser:appuser /app
+# Set environment variables for Hugging Face and Streamlit
+ENV HF_HOME=/app/cache
+ENV STREAMLIT_CONFIG_DIR=/app/.streamlit
+# Switch to non-root user
+USER appuser
+# Expose Streamlit port
+EXPOSE 8501
+# Healthcheck
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
+# Run Streamlit app
+ENTRYPOINT ["streamlit", "run", "src/app/main.py", "--server.port=8501", "--server.address=0.0.0.0"]

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: NLP conference Crossbridge
+app_port: 8501
+emoji: 🈂️
+colorFrom: gray
+colorTo: purple
+sdk: docker
+pinned: false
+license: mit
+short_description: Traditional NLP for AI written detection
+---

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+numpy == 1.26.4
+pandas == 2.2.0
+pyarrow == 15.0.0
+fastparquet == 2024.2.0
+mlflow == 2.10.2
+nltk == 3.8.1
+seaborn == 0.13.2
+matplotlib == 3.8.2
+python-dotenv == 1.0.1

src/app/__init__.py ADDED Viewed

File without changes

src/app/__pycache__/pipelines.cpython-311.pyc ADDED Viewed

Binary file (2.64 kB). View file

src/app/__pycache__/xai.cpython-311.pyc ADDED Viewed

Binary file (1.66 kB). View file

src/app/main.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import streamlit as st
+import sys
+from pipelines import pipeline_inference
+from xai import get_explanation
+import time
+import pandas as pd
+import plotly.express as px
+import nltk
+nltk.download('stopwords')
+st.title('Text identification app')
+st.subheader('This app is designed to identify if a text was written by a human or an AI')
+st.markdown('In many cases, using AI is not a suitable solution because this does not allow to develop creativity and innovation in written assessments')
+col1, col2 = st.columns(2)
+with col1:
+    a = st.button('Classify text')
+with col2:
+    xai_option = st.toggle('Explain the classification', value = False)
+with st.sidebar:
+    st.subheader('About the App')
+    st.markdown('Data used for the training come from the following source: https://www.kaggle.com/datasets/shanegerami/ai-vs-human-text')
+    st.markdown('The model built is not based on transformer architecture, it uses traditional Natural Language Processing techniques')
+    st.empty()
+    st.subheader('Author')
+    st.markdown('Sebastián Sarasti Zambonino')
+    st.markdown('Data Scientist - Machine Learning Engineer')
+    st.markdown('https://www.linkedin.com/in/sebastiansarasti/')
+    st.markdown('https://github.com/sebassaras02')
+text_input = st.text_area('Enter the text to classify', height = 200)
+result = None
+if a and not xai_option:
+    if text_input:
+        with st.spinner('Classifying the text, wait please ...'):
+            time.sleep(1)
+        result = pipeline_inference(text_input)
+        st.subheader('Probability that the text was classified as:')
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric('Human written', result[0][0] )
+        with col2:
+            st.metric('AI written', result[0][1])
+        if result[0][1]>0.6:
+            st.warning('High probability that the text was written by an AI')
+        else:
+            st.success('High probability that the text was written by a human')
+    else:
+        st.exception('Please enter the text to classify, no text was provided')
+elif a and xai_option:
+    if text_input:
+        with st.spinner('Classifying the text, wait please ...'):
+            time.sleep(1)
+        result = pipeline_inference(text_input)
+        st.subheader('Probability that the text was classified as:')
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric('Human written', result[0][0] )
+        with col2:
+            st.metric('AI written', result[0][1])
+        if result[0][1]>0.6:
+            st.warning('High probability that the text was written by an AI')
+        else:
+            st.success('High probability that the text was written by a human')
+        with st.spinner('Explaining the classification, wait please ...'):
+            explanation = get_explanation(text_input)
+            df = pd.DataFrame(list(explanation.items()), columns=['Palabras', 'Números'])
+            df['Signo'] = ['Positivo' if x >= 0 else 'Negativo' for x in df['Números']]
+            df = df.sort_values('Números', ascending=False)
+            df = df.rename(columns={'Palabras': 'Words', 'Números': 'Frequency', 'Signo': 'Type'})
+            df['Type'] = df['Type'].map({'Positivo': 'IA Pattern', 'Negativo': 'Humman Pattern'})
+            fig = px.bar(df, y='Words', x='Frequency', color='Type', color_discrete_map={'IA Pattern': 'red', 'Humman Pattern': 'blue'})
+            st.subheader('Explanation of the classification:')
+            st.markdown('The following words are the most important to classify the text:')
+            st.plotly_chart(fig)

src/app/pipelines.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import numpy as np
+import pandas as pd
+import re
+import mlflow
+from joblib import dump, load
+import sys
+from utils.text_processing import TextProcessing
+def pipeline_inference(input : str):
+    # load tf-idf model
+    tfidf_model = load('models/tfidf_model.joblib')
+    # load pca model
+    pca_model = load('models/pca_model.joblib')
+    # load the model
+    classifier_model = load('models/classifier_model.joblib')
+    # preprocess the input
+    text_processing = TextProcessing()
+    text_processed = text_processing.fit_transform_text(input)
+    vector = tfidf_model.transform([text_processed])
+    vector_pca = pca_model.transform(vector)
+    # make a vector with the pca values
+    df = pd.DataFrame(vector_pca, columns = ["dim1", "dim2", "dim3", "dim4", "dim5"])
+    # make the prediction
+    prediction = classifier_model.predict_proba(df)
+    return prediction

src/app/requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+numpy==1.26.4
+pandas==2.2.0
+pyarrow==15.0.0
+fastparquet==2024.2.0
+mlflow==2.10.2
+nltk==3.8.1
+seaborn==0.13.2
+matplotlib==3.8.2
+python-dotenv==1.0.1
+plotly==5.19.0
+lime==0.2.0.1

src/app/test.ipynb ADDED Viewed

	@@ -0,0 +1,510 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlflow\n",
+    "from dotenv import load_dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "load_dotenv('../../.env')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tfidf_logged_model = 'runs:/a63128b897bd4f91a06f20939a715b98/tfidf_model'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\sebit\\.conda\\envs\\mlops\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  2.50it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "tfidf_model = mlflow.sklearn.load_model(tfidf_logged_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {\n",
+       "  /* Definition of color scheme common for light and dark mode */\n",
+       "  --sklearn-color-text: black;\n",
+       "  --sklearn-color-line: gray;\n",
+       "  /* Definition of color scheme for unfitted estimators */\n",
+       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
+       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
+       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
+       "  --sklearn-color-unfitted-level-3: chocolate;\n",
+       "  /* Definition of color scheme for fitted estimators */\n",
+       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
+       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
+       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
+       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
+       "\n",
+       "  /* Specific color for light theme */\n",
+       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
+       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-icon: #696969;\n",
+       "\n",
+       "  @media (prefers-color-scheme: dark) {\n",
+       "    /* Redefinition of color scheme for dark theme */\n",
+       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
+       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-icon: #878787;\n",
+       "  }\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 pre {\n",
+       "  padding: 0;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-hidden--visually {\n",
+       "  border: 0;\n",
+       "  clip: rect(1px 1px 1px 1px);\n",
+       "  clip: rect(1px, 1px, 1px, 1px);\n",
+       "  height: 1px;\n",
+       "  margin: -1px;\n",
+       "  overflow: hidden;\n",
+       "  padding: 0;\n",
+       "  position: absolute;\n",
+       "  width: 1px;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-dashed-wrapped {\n",
+       "  border: 1px dashed var(--sklearn-color-line);\n",
+       "  margin: 0 0.4em 0.5em 0.4em;\n",
+       "  box-sizing: border-box;\n",
+       "  padding-bottom: 0.4em;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-container {\n",
+       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
+       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
+       "     so we also need the `!important` here to be able to override the\n",
+       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
+       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
+       "  display: inline-block !important;\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-text-repr-fallback {\n",
+       "  display: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-parallel-item,\n",
+       "div.sk-serial,\n",
+       "div.sk-item {\n",
+       "  /* draw centered vertical line to link estimators */\n",
+       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
+       "  background-size: 2px 100%;\n",
+       "  background-repeat: no-repeat;\n",
+       "  background-position: center center;\n",
+       "}\n",
+       "\n",
+       "/* Parallel-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item::after {\n",
+       "  content: \"\";\n",
+       "  width: 100%;\n",
+       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
+       "  flex-grow: 1;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel {\n",
+       "  display: flex;\n",
+       "  align-items: stretch;\n",
+       "  justify-content: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
+       "  align-self: flex-end;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
+       "  align-self: flex-start;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
+       "  width: 0;\n",
+       "}\n",
+       "\n",
+       "/* Serial-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-serial {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "  align-items: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  padding-right: 1em;\n",
+       "  padding-left: 1em;\n",
+       "}\n",
+       "\n",
+       "\n",
+       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
+       "clickable and can be expanded/collapsed.\n",
+       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
+       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
+       "*/\n",
+       "\n",
+       "/* Pipeline and ColumnTransformer style (default) */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable {\n",
+       "  /* Default theme specific background. It is overwritten whether we have a\n",
+       "  specific estimator or a Pipeline/ColumnTransformer */\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable label */\n",
+       "#sk-container-id-1 label.sk-toggleable__label {\n",
+       "  cursor: pointer;\n",
+       "  display: block;\n",
+       "  width: 100%;\n",
+       "  margin-bottom: 0;\n",
+       "  padding: 0.5em;\n",
+       "  box-sizing: border-box;\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
+       "  /* Arrow on the left of the label */\n",
+       "  content: \"▸\";\n",
+       "  float: left;\n",
+       "  margin-right: 0.25em;\n",
+       "  color: var(--sklearn-color-icon);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable content - dropdown */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content {\n",
+       "  max-height: 0;\n",
+       "  max-width: 0;\n",
+       "  overflow: hidden;\n",
+       "  text-align: left;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content pre {\n",
+       "  margin: 0.2em;\n",
+       "  border-radius: 0.25em;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
+       "  /* Expand drop-down */\n",
+       "  max-height: 200px;\n",
+       "  max-width: 100%;\n",
+       "  overflow: auto;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
+       "  content: \"▾\";\n",
+       "}\n",
+       "\n",
+       "/* Pipeline/ColumnTransformer-specific style */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific style */\n",
+       "\n",
+       "/* Colorize estimator box */\n",
+       "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  /* The background is the default theme color */\n",
+       "  color: var(--sklearn-color-text-on-default-background);\n",
+       "}\n",
+       "\n",
+       "/* On hover, darken the color of the background */\n",
+       "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Label box, darken color on hover, fitted */\n",
+       "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator label */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  font-family: monospace;\n",
+       "  font-weight: bold;\n",
+       "  display: inline-block;\n",
+       "  line-height: 1.2em;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label-container {\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific */\n",
+       "#sk-container-id-1 div.sk-estimator {\n",
+       "  font-family: monospace;\n",
+       "  border: 1px dotted var(--sklearn-color-border-box);\n",
+       "  border-radius: 0.25em;\n",
+       "  box-sizing: border-box;\n",
+       "  margin-bottom: 0.5em;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "/* on hover */\n",
+       "#sk-container-id-1 div.sk-estimator:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
+       "\n",
+       "/* Common style for \"i\" and \"?\" */\n",
+       "\n",
+       ".sk-estimator-doc-link,\n",
+       "a:link.sk-estimator-doc-link,\n",
+       "a:visited.sk-estimator-doc-link {\n",
+       "  float: right;\n",
+       "  font-size: smaller;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1em;\n",
+       "  height: 1em;\n",
+       "  width: 1em;\n",
+       "  text-decoration: none !important;\n",
+       "  margin-left: 1ex;\n",
+       "  /* unfitted */\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted,\n",
+       "a:link.sk-estimator-doc-link.fitted,\n",
+       "a:visited.sk-estimator-doc-link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "/* Span, style for the box shown on hovering the info icon */\n",
+       ".sk-estimator-doc-link span {\n",
+       "  display: none;\n",
+       "  z-index: 9999;\n",
+       "  position: relative;\n",
+       "  font-weight: normal;\n",
+       "  right: .2ex;\n",
+       "  padding: .5ex;\n",
+       "  margin: .5ex;\n",
+       "  width: min-content;\n",
+       "  min-width: 20ex;\n",
+       "  max-width: 50ex;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  box-shadow: 2pt 2pt 4pt #999;\n",
+       "  /* unfitted */\n",
+       "  background: var(--sklearn-color-unfitted-level-0);\n",
+       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted span {\n",
+       "  /* fitted */\n",
+       "  background: var(--sklearn-color-fitted-level-0);\n",
+       "  border: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link:hover span {\n",
+       "  display: block;\n",
+       "}\n",
+       "\n",
+       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link {\n",
+       "  float: right;\n",
+       "  font-size: 1rem;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1rem;\n",
+       "  height: 1rem;\n",
+       "  width: 1rem;\n",
+       "  text-decoration: none;\n",
+       "  /* unfitted */\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "#sk-container-id-1 a.estimator_doc_link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;TfidfVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\">?<span>Documentation for TfidfVectorizer</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)</pre></div> </div></div></div></div>"
+      ],
+      "text/plain": [
+       "TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tfidf_model"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlops",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

src/app/utils/__init__.py ADDED Viewed

File without changes

src/app/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (167 Bytes). View file

src/app/utils/__pycache__/log_model.cpython-311.pyc ADDED Viewed

Binary file (4.4 kB). View file

src/app/utils/__pycache__/text_features.cpython-311.pyc ADDED Viewed

Binary file (4.43 kB). View file

src/app/utils/__pycache__/text_processing.cpython-311.pyc ADDED Viewed

Binary file (9.83 kB). View file

src/app/utils/download_model.py ADDED Viewed

	@@ -0,0 +1,23 @@

+def pipeline_download_models():
+    """
+    This function downloads the models from the mlflow server and saves them in the models folder
+    Args:
+        None
+    Returns:
+        None
+    """
+    load_dotenv('../../.env')
+    # download the tf-idf model
+    tfidf_logged_model = 'runs:/a63128b897bd4f91a06f20939a715b98/tfidf_model'
+    tfidf_model = mlflow.sklearn.load_model(tfidf_logged_model)
+    dump(tfidf_model, '../../models/tfidf_model.joblib')
+    # download the pca model
+    pca_logged_model = 'runs:/a63128b897bd4f91a06f20939a715b98/pca_model'
+    pca_model = mlflow.sklearn.load_model(pca_logged_model)
+    dump(pca_model, '../../models/pca_model.joblib')
+    # download the classifier
+    classifier_logged_model = 'runs:/49483b7a0f95430a8492a448ac13e8d7/random-forest'
+    classifier_model = mlflow.sklearn.load_model(classifier_logged_model)
+    dump(classifier_model, '../../models/classifier_model.joblib')

src/app/utils/log_model.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import mlflow
+from datetime import datetime
+from sklearn.metrics import classification_report
+class LogModel:
+    def __init__(self, mlflow_uri : str, mlflow_experiment_name : str, mlflow_run_name : str, X_train, Y_train, X_test, Y_test, model, model_name) -> None:
+        self.mlflow_uri = mlflow_uri
+        self.mlflow_experiment_name = mlflow_experiment_name
+        self.mlflow_run_name = mlflow_run_name
+        self.X_train = X_train
+        self.Y_train = Y_train
+        self.X_test = X_test
+        self.Y_test = Y_test
+        self.model_name = model_name
+        self.model = model
+        # set the mlflow uri
+        mlflow.set_tracking_uri(self.mlflow_uri)
+        mlflow.set_experiment(self.mlflow_experiment_name)
+    def evaluate_train_data(self):
+        """
+        This function evaluates the model on the training data
+        """
+        self.report1 = classification_report(self.Y_test, self.model.predict(self.X_test), output_dict=True)
+        mlflow.log_metric("accuracy", self.report1.pop("accuracy"))
+        for class_or_avg, metrics_dict in self.report1.items():
+            for metric, value in metrics_dict.items():
+                mlflow.log_metric(class_or_avg + '_' + metric,value)
+    def evaluate_test_data(self):
+        """
+        This function evaluates the model on the test data
+        """
+        self.report2 = classification_report(self.Y_test, self.model.predict(self.X_test), output_dict=True)
+        mlflow.log_metric("accuracy", self.report2.pop("accuracy"))
+        for class_or_avg, metrics_dict in self.report2.items():
+            for metric, value in metrics_dict.items():
+                mlflow.log_metric(class_or_avg + '_' + metric,value)
+    def register_model(self):
+        """
+        This function register the model created parameters and the model
+        """
+        params = self.model.get_params()
+        mlflow.log_params(params)
+        mlflow.sklearn.log_model(self.model, self.model_name)
+    def fit_transform(self):
+        with mlflow.start_run(run_name = self.mlflow_run_name + " " + datetime.today().strftime("%Y-%m-%d %H:%M:%S")):
+            self.evaluate_train_data()
+            self.evaluate_test_data()
+            self.register_model()
+            mlflow.end_run()
+        print("Model performance over the test dataset")
+        print(self.report2)

src/app/utils/text_features.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+from joblib import dump
+import numpy as np
+from sklearn.decomposition import PCA
+import mlflow
+from datetime import datetime
+class FeatureTextExtraction:
+    def __init__(self, mlflow_uri : str, mlflow_experiment_name : str, mlflow_run_name : str) -> None:
+        self.vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.1, max_features=2000)
+        self.pca = PCA(5, random_state=99)
+        self.mlflow_uri = mlflow_uri
+        self.mlflow_experiment_name = mlflow_experiment_name
+        self.mlflow_run_name = mlflow_run_name
+        # set the mlflow uri
+        mlflow.set_tracking_uri(self.mlflow_uri)
+        mlflow.set_experiment(self.mlflow_experiment_name)
+    def fit_tfidf(self, df: pd.DataFrame) -> None:
+        """
+        This function fits the model to the data
+        Args:
+            df: pd.DataFrame: The dataframe containing the data
+        Returns:
+            None
+        """
+        self.df = df
+        self.df = self.df.dropna(subset=["processed_text"])
+        self.matrix = self.vectorizer.fit_transform(df["processed_text"])
+    def dimesion_reduction(self) -> pd.DataFrame:
+        """
+        This function reduces the dimension of the data
+        Returns:
+            pd.DataFrame: The dataframe containing the transformed data
+        """
+        self.reduced_data = self.pca.fit_transform(self.matrix.toarray())
+        # convert to dataframe
+        self.reduced_df = pd.DataFrame(self.reduced_data, columns=["dim1", "dim2", "dim3", "dim4", "dim5"])
+        return self.reduced_df
+    def fit_transform(self, df : pd.DataFrame) -> pd.DataFrame:
+        """
+        This function fits the model to the data
+        Args:
+            df: pd.DataFrame: The dataframe containing the data
+        Returns:
+            pd.DataFrame: The dataframe containing the transformed data
+        """
+        with mlflow.start_run(run_name = self.mlflow_run_name + " " + datetime.today().strftime("%Y-%m-%d %H:%M:%S")):
+            # log the parameters of the TF-IDF model
+            self.fit_tfidf(df)
+            # log the model of the TF-IDF model
+            mlflow.sklearn.log_model(self.vectorizer, "tfidf_model")
+            # log the parameters of the PCA model
+            self.data = self.dimesion_reduction()
+            # log the model of the PCA model
+            mlflow.sklearn.log_model(self.pca, "pca_model")
+            # end the run
+            mlflow.end_run()
+            # delete the parameters
+            self.final_df = pd.concat([self.df, self.data], axis=1)
+        return self.final_df

src/app/utils/text_processing.py ADDED Viewed

	@@ -0,0 +1,160 @@

+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+import pandas as pd
+from nltk.stem import PorterStemmer
+import re
+class TextProcessing:
+    """
+    This class contains all methods to process text data.
+    """
+    def __init__(self, language : str = 'english'):
+        self.list_stopwords = list(set(stopwords.words(language)))
+        self.lemmatizer = WordNetLemmatizer()
+        self.stemmer = PorterStemmer()
+    def tokenize(self, text : str) -> list:
+        """
+        This function takes a string and returns a list of words in the string.
+        Args:
+            text : A string of words
+        Returns:
+            the tokens
+        """
+        return text.split()
+    def remove_stopwords(self, list_tokens : list) -> list:
+        """
+        This function removes the stopwords from the list of tokens.
+        Args:
+            list_tokens : list of tokens to process
+        Returns:
+            list of tokens with the stopwords removed
+        """
+        return [word for word in list_tokens if word not in self.list_stopwords]
+    def lemmatize_tokens(self, list_tokens : list) -> list:
+        """
+        This function lemmatizes a list of tokens.
+        Args:
+            list_tokens : list of tokens
+            lemmatizer : instance of WordNetLemmatizer
+        Returns:
+            list of lemmatized tokens
+        """
+        return [self.lemmatizer.lemmatize(word) for word in list_tokens]
+    def steem_tokens(self, list_tokens : list) -> list:
+        """
+        This function steems a list of tokens.
+        Args:
+            list_tokens : list of tokens
+        Returns:
+            list of steemed tokens
+        """
+        return [self.stemmer.stem(word) for word in list_tokens]
+    def lowercase_tokens(self, list_tokens : list) -> list:
+        """"
+        This function receives a list of tokens and returns a list of tokens in lowercase
+        Args:
+            list_tokens: list of strings
+        Returns:
+            list of strings
+        """
+        return [word.lower() for word in list_tokens]
+    def remove_short_tokens(self, token_list : list, min_length : int = 3) -> list:
+        """
+        This function removes words from a list of tokens that are shorter than min_length.
+        Args:
+            token_list: list of strings
+            min_length: int, minimum length of the words to keep
+        Returns:
+            list of strings
+        """
+        return [word for word in token_list if len(word) >= min_length]
+    def remove_punctuation(self, text : str) -> str:
+        """
+        This function removes punctuation from a list of tokens.
+        Args:
+            token_list: list of strings
+        Returns:
+            list of strings
+        """
+        if isinstance(text, bytes):
+            text = text.decode('utf-8')  # Decodificar si es una cadena de bytes
+        text = re.sub(r'[^\w\s]', '', text)
+        text = re.sub(r'\n', '', text)
+        text = re.sub(r'\d', '', text)
+        return text
+    def join_tokens_cleaned(self, token_list : list ) -> list:
+        """
+        This function joins the tokens in a list
+        Args:
+            token_list : list of tokens cleaned
+        Returns:
+            text : final phrase
+        """
+        return " ".join(token_list)
+    def fit_transform(self, df : pd.DataFrame) -> pd.DataFrame:
+        """
+        This function receives a dataframe and applies the text processing methods to the text column.
+        Args:
+            df : pandas DataFrame with a column named 'text'
+        Returns:
+            df : pandas DataFrame with a column named 'processed_text'
+        """
+        df['text'] = df['text'].apply(lambda x: self.remove_punctuation(x))
+        df['processed_text'] = df['text'].apply(lambda x: self.tokenize(x))
+        df['processed_text'] = df['processed_text'].apply(lambda x: self.lowercase_tokens(x))
+        df['processed_text'] = df['processed_text'].apply(lambda x: self.remove_stopwords(x))
+        df['processed_text'] = df['processed_text'].apply(lambda x: self.remove_short_tokens(x))
+        df['processed_text'] = df['processed_text'].apply(lambda x: self.steem_tokens(x))
+        df['processed_text'] = df['processed_text'].apply(lambda x: self.join_tokens_cleaned(x))
+        return df
+    def fit_transform_text(self, text):
+        """
+        This function receives a string and applies the text processing methods to it.
+        Args:
+            text : list with raw texts
+        Returns:
+            text : list with curated texts
+        """
+        text = self.remove_punctuation(text)
+        text = self.tokenize(text)
+        text = self.lowercase_tokens(text)
+        text = self.remove_stopwords(text)
+        text = self.remove_short_tokens(text)
+        text = self.steem_tokens(text)
+        text = self.join_tokens_cleaned(text)
+        return text

src/app/xai.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import numpy as np
+import pandas as pd
+import sys
+from lime.lime_text import LimeTextExplainer
+from pipelines import pipeline_inference
+def f(x):
+    results = np.zeros((len(x), 2))  # Asumiendo que num_classes es la cantidad de clases en tu problema
+    for i, element in enumerate(x):
+        predictions = pipeline_inference(element)
+        results[i, :] = predictions
+    return results
+def get_explanation(text):
+    explainer = LimeTextExplainer(class_names=["Human", "AI"])
+    explanation = explainer.explain_instance(
+            text_instance = text,
+            classifier_fn = f,
+            num_features=30,
+            num_samples = 10
+        )
+    a = explanation.as_list()
+    result = {element[0]: element[1] for element in a}
+    return result