"""Streamlit app that summarizes an uploaded PDF and answers questions about it.

The summary is produced by a BART seq2seq model and answers by a DistilBERT
extractive question-answering pipeline, both loaded through ``transformers``.
"""
import os

import fitz  # PyMuPDF
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

summarization_model_name = 'facebook/bart-large-cnn'
qa_model_name = 'distilbert-base-uncased-distilled-squad'


# Streamlit re-executes this script on every widget interaction; without
# caching, both models would be reloaded on each click.
# NOTE: st.cache_resource requires Streamlit >= 1.18.
@st.cache_resource
def _load_summarizer(model_name):
    """Load and cache the summarization tokenizer and seq2seq model."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )


@st.cache_resource
def _load_qa(model_name):
    """Load and cache the QA tokenizer, model, and the pipeline built on them."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa = pipeline(
        'question-answering', model=loaded_model, tokenizer=loaded_tokenizer
    )
    return loaded_tokenizer, loaded_model, qa


# Load the models (module-level names kept for anything that imports them).
tokenizer, summarization_model = _load_summarizer(summarization_model_name)
qa_tokenizer, qa_model, qa_pipeline = _load_qa(qa_model_name)


def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Either a filesystem path to the PDF, or the raw PDF content as
            ``bytes``, ``bytearray`` or ``memoryview``.

    Returns:
        The text of all pages joined in page order (may be empty, e.g. for a
        PDF that contains only scanned images).
    """
    if isinstance(file, (bytes, bytearray, memoryview)):
        # Parse straight from memory so no temporary file is needed.
        doc = fitz.open(stream=bytes(file), filetype="pdf")
    else:
        doc = fitz.open(file)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the underlying document handle.
        doc.close()


def summarize_document(document):
    """Summarize ``document`` with the BART summarization model.

    The input is truncated to the first 1024 tokens before generation, so
    only the beginning of a long document contributes to the summary.

    Args:
        document: The text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = tokenizer(
        document, return_tensors='pt', max_length=1024, truncation=True
    )
    summary_ids = summarization_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Streamlit app
st.title("PDF Summarizer and Q&A")
st.write("Upload a PDF file to get a summary and ask questions about the content.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Read the upload from memory instead of writing a fixed "temp.pdf":
    # a shared file name would be clobbered by concurrent sessions and
    # left behind if extraction raised.
    document_text = extract_text_from_pdf(uploaded_file.getvalue())

    if not document_text.strip():
        # Nothing to summarize or query; the QA pipeline rejects an empty
        # context, so stop here with a clear message.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Display the extracted text
        st.write("Extracted Text:")
        st.write(document_text)

        if st.button("Summarize"):
            with st.spinner('Summarizing...'):
                summary = summarize_document(document_text)
            st.write("**Summary:**")
            st.write(summary)

        question = st.text_input("Ask a question about the document")
        if st.button("Get Answer"):
            if question:
                with st.spinner('Generating answer...'):
                    answer = qa_pipeline(
                        question=question, context=document_text
                    )
                st.write("**Answer:**")
                st.write(answer['answer'])
            else:
                st.write("Please enter a question.")