# RRA_Chatbot / app.py
# Hugging Face Space by UKURIKIYEYEZU — revision bb182b7 (verified)
import os
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
from langchain_chroma import Chroma
import requests
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr
from PyPDF2 import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
# Groq API key is read from the 'GBV' environment/secret variable.
# NOTE(review): the secret name 'GBV' looks unrelated to RRA — confirm it is intentional.
groq_api_key= os.environ.get('GBV')
# Embedding model used for both indexing and retrieval; downloaded from the
# Hugging Face hub on first use.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
def scrape_websites(base_urls):
    """Scrape each base URL and its same-host links; return {url: text content}.

    HTML pages are reduced to visible text via clean_body_content(); links
    ending in .pdf are downloaded and parsed with extract_pdf_text() instead
    of being fetched as HTML (the original fetched PDFs as text first and then
    overwrote the garbled result — now PDFs skip the HTML path entirely).
    Any unexpected error aborts the crawl and returns an empty dict.
    """
    try:
        visited_links = set()  # To avoid revisiting the same link
        content_by_url = {}  # Store content from each URL
        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs
            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                continue
            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)
            # Extract and process all internal (same-host) links of this page.
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                visited_links.add(link)
                # PDF links get a dedicated extraction path.
                if link.lower().endswith('.pdf'):
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                    continue
                print(f"Scraping link: {link}")
                page_content = fetch_page_content(link)
                if page_content:
                    content_by_url[link] = clean_body_content(page_content)
        return content_by_url
    except Exception as e:
        # Best-effort crawl: report and return nothing rather than crash the app.
        print(f"Error during scraping: {e}")
        return {}
def fetch_page_content(url):
    """GET *url* and return the response body as text.

    Returns None (after logging) on any network/HTTP failure; a 10-second
    timeout guards against hanging requests.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return resp.text
def extract_internal_links(base_url, soup):
    """Return the set of absolute same-host URLs linked from *soup*.

    Relative hrefs are resolved against *base_url*; external hosts are
    filtered out via is_internal_link().
    """
    internal = set()
    for anchor in soup.find_all("a", href=True):
        candidate = urljoin(base_url, anchor["href"])
        if is_internal_link(base_url, candidate):
            internal.add(candidate)
    return internal
def is_internal_link(base_url, link_url):
    """Return True when *link_url* points at the same network location (host)
    as *base_url*. A relative link_url has an empty netloc and compares False.
    """
    return urlparse(base_url).netloc == urlparse(link_url).netloc
def extract_pdf_text(pdf_url):
    """Download the PDF at *pdf_url* and return its concatenated page text.

    Returns None when the download fails, the file cannot be parsed, or no
    text could be extracted (e.g. an image-only scan).
    """
    try:
        # Timeout added for consistency with fetch_page_content and to avoid hangs.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            pdf_text = ""
            for page in reader.pages:
                # extract_text() may return None for pages without a text layer;
                # guard with `or ""` so concatenation never raises TypeError.
                pdf_text += page.extract_text() or ""
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
def clean_body_content(html_content):
    """Reduce raw HTML to its visible text.

    Drops <script> and <style> elements, then returns the remaining text with
    every line stripped and blank lines removed, joined by newlines.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    raw_text = soup.get_text(separator="\n")
    stripped = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped if line)
if __name__ == "__main__":
    # Seed the Chroma vector store by scraping the RRA customs-services site.
    website = [
        "https://www.rra.gov.rw/en/customs-services"
    ]
    all_content = scrape_websites(website)

    # Flatten {url: content} directly into embedding-ready strings.
    # (The original built an intermediate temp_list of tuples and re-checked
    # their types — all items are (url, content) pairs, so this is equivalent.)
    processed_texts = [
        f"url: {url}, content: {content}" for url, content in all_content.items()
    ]

    def chunk_string(s, chunk_size=1000):
        """Split *s* into fixed-size character chunks (last may be shorter)."""
        return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]

    # Chunk every document so each embedding input stays small.
    chunked_texts = []
    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))

    vectorstore = Chroma(
        collection_name="R_R_A",
        embedding_function=embed_model,
        persist_directory="./",
    )
    # NOTE: the original called vectorstore.get().keys() here — a no-op whose
    # result was discarded; removed.
    vectorstore.add_texts(chunked_texts)
# template = ("""
# You are a friendly and intelligent chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and engaging responses from the provided context: {context} while maintaining a natural tone. Follow these guidelines:
# 1. **Greetings:** If the user greets you (e.g., "Morning," "Hello," "Hi"), respond warmly and acknowledge the greeting. For example:
# - "😊 Good morning! How can I assist you today?"
# - "Hello! What can I do for you? πŸš€"
# 2. **Extract Information:** If the user asks for specific information, extract only the relevant details from the provided context: {context}.
# 3. **Human-like Interaction:** Respond in a warm, conversational tone. Use emojis occasionally to make the interaction more engaging (e.g., 😊, πŸš€).
# 4. **Stay Updated:** Acknowledge the current date and time to show you are aware of real-time updates.
# 5. **No Extra Content:** If no information matches the user's request, respond politely: "I don't have that information at the moment, but I'm happy to help with something else! 😊"
# 6. **Personalized Interaction:** Use the user's historical interactions (if available) to tailor your responses and make the conversation more personalized.
# 7. **Direct Data Only:** If the user requests specific data, provide only the requested information without additional explanations unless asked.
# Context: {context}
# User's Question: {question}
# Your Response:
# """)
# System prompt for the RAG chain. LangChain fills {context} with retrieved
# chunks and {question} with the user's message; the text below governs tone,
# grounding, fallback wording, and link extraction. (Runtime string — edits
# here change chatbot behaviour.)
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
1. **Warm & Natural Interaction**
- If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
- Example responses:
- "😊 Good morning! How can I assist you today?"
- "Hello! What can I do for you? πŸš€"
2. **Precise Information Extraction**
- Provide only the relevant details from the given context: {context}.
- Do not generate extra content or assumptions beyond the provided information.
3. **Conversational & Engaging Tone**
- Keep responses friendly, natural, and engaging.
- Use occasional emojis (e.g., 😊, πŸš€) to make interactions more lively.
4. **Awareness of Real-Time Context**
- If necessary, acknowledge the current date and time to show awareness of real-world updates.
5. **Handling Missing Information**
- If no relevant information exists in the context, respond politely:
- "I don't have that information at the moment, but I'm happy to help with something else! 😊"
6. **Personalized Interaction**
- If user history is available, tailor responses based on their previous interactions for a more natural and engaging conversation.
7. **Direct, Concise Responses**
- If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.
8. **Extracting Relevant Links**
- If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
- Example response:
- "Here is the link you requested: [URL]"
**Context:** {context}
**User's Question:** {question}
**Your Response:**
""")
# Build the retrieval-augmented generation (RAG) pipeline.
rag_prompt = PromptTemplate.from_template(template)
# Retriever over the Chroma store populated in the __main__ block above.
# NOTE(review): vectorstore is only defined when run as a script — this module
# will fail at import time if imported elsewhere; confirm intended usage.
retriever = vectorstore.as_retriever()
# NOTE(review): mid-file imports — consider moving to the top of the module.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Groq-hosted Llama 3.3 70B chat model, keyed by the 'GBV' secret read above.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key )
# Chain: {retrieved context, passthrough question} -> prompt -> LLM -> plain string.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)
def rag_memory_stream(message, history):
    """Stream the RAG chain's answer for *message*, yielding the growing reply.

    Each yield is the full text accumulated so far, which is what Gradio's
    ChatInterface expects from a streaming fn. *history* is supplied by
    ChatInterface but not used by the chain.
    """
    accumulated = ""
    for token in rag_chain.stream(message):
        accumulated += token
        yield accumulated
# Chat window title shown in the Gradio UI.
title = "RRA Chatbot"
# Clickable example questions rendered beneath the chat box.
examples = [
    " Can you help me with the tax rates on vehicle importation?",
    " What is TIN deregistration? What about Tax account deactivation?",
    "When do I receive my registration certificate?"
]
# Custom CSS passed verbatim to Gradio to style the interface.
custom_css = """
body {
font-family: "Arial", serif;
}
.gradio-container {
font-family: "Times New Roman", serif;
}
.gr-button {
background-color: #007bff; /* Blue button */
color: white;
border: none;
border-radius: 5px;
font-size: 16px;
padding: 10px 20px;
cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
outline: none; /* Remove outline focus for a cleaner look */
}
/* Custom CSS for the examples section */
.gr-examples {
font-size: 30px; /* Increase font size of examples */
background-color: #f9f9f9; /* Light background color */
border-radius: 30px; /* Rounded corners */
}
.gr-examples .example {
background-color: white; /* White background for each example */
cursor: pointer; /* Change cursor to pointer on hover */
transition: background-color 0.3s ease; /* Smooth hover effect */
}
.gr-examples .example:hover {
background-color: #f1f1f1; /* Light gray background on hover */
}
"""
# Create the Chat Interface; rag_memory_stream yields progressively longer
# partial replies, so responses stream into the chat window.
demo = gr.ChatInterface(
    fn=rag_memory_stream,
    title=title,
    examples=examples,  # Display the example questions
    fill_height=True,
    theme="soft",
    css=custom_css,  # Apply the custom CSS
)
# Launch the app
if __name__ == "__main__":
    # NOTE(review): share=True and inbrowser=True are typically ignored on a
    # hosted Space (the platform serves the app itself) — confirm deployment target.
    demo.launch(share=True, inbrowser=True, debug=True)