| """Browse a webpage and summarize it using the LLM model""" | |
| from __future__ import annotations | |
| from urllib.parse import urljoin, urlparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from requests import Response | |
| from requests.compat import urljoin | |
| from autogpt.config import Config | |
| from autogpt.memory import get_memory | |
| from autogpt.processing.html import extract_hyperlinks, format_hyperlinks | |
| CFG = Config() | |
| memory = get_memory(CFG) | |
| session = requests.Session() | |
| session.headers.update({"User-Agent": CFG.user_agent}) | |


def is_valid_url(url: str) -> bool:
    """Check if the URL is valid

    Args:
        url (str): The URL to check

    Returns:
        bool: True if the URL is valid, False otherwise
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
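
# Illustrative usage (not part of the original module; the URLs are placeholders):
#   >>> is_valid_url("https://example.com/page")
#   True
#   >>> is_valid_url("not-a-url")
#   False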


def sanitize_url(url: str) -> str:
    """Sanitize the URL

    Args:
        url (str): The URL to sanitize

    Returns:
        str: The sanitized URL
    """
    # Rebuild the URL from its scheme, host, and path only; the query string
    # and fragment are dropped
    return urljoin(url, urlparse(url).path)
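
# Illustrative behaviour (not part of the original module; example.com is a
# placeholder):
#   >>> sanitize_url("https://example.com/page?q=1#top")
#   'https://example.com/page'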


def check_local_file_access(url: str) -> bool:
    """Check if the URL is a local file

    Args:
        url (str): The URL to check

    Returns:
        bool: True if the URL is a local file, False otherwise
    """
    # 2130706433 is 127.0.0.1 expressed as a single decimal integer
    local_prefixes = [
        "file:///",
        "file://localhost/",
        "file://localhost",
        "http://localhost",
        "http://localhost/",
        "https://localhost",
        "https://localhost/",
        "http://2130706433",
        "http://2130706433/",
        "https://2130706433",
        "https://2130706433/",
        "http://127.0.0.1/",
        "http://127.0.0.1",
        "https://127.0.0.1/",
        "https://127.0.0.1",
        "https://0.0.0.0/",
        "https://0.0.0.0",
        "http://0.0.0.0/",
        "http://0.0.0.0",
        "http://0000",
        "http://0000/",
        "https://0000",
        "https://0000/",
    ]
    return any(url.startswith(prefix) for prefix in local_prefixes)
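
# Illustrative usage (not part of the original module):
#   >>> check_local_file_access("file:///etc/passwd")
#   True
#   >>> check_local_file_access("http://127.0.0.1:8080/admin")
#   True
#   >>> check_local_file_access("https://example.com")
#   False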


def get_response(
    url: str, timeout: int = 10
) -> tuple[None, str] | tuple[Response, None]:
    """Get the response from a URL

    Args:
        url (str): The URL to get the response from
        timeout (int): The timeout for the HTTP request

    Returns:
        tuple[None, str] | tuple[Response, None]: The response and error message

    Note:
        ValueError (invalid URL or restricted local access) and
        requests.exceptions.RequestException (connection errors, timeouts,
        etc.) are caught internally and returned as an error string rather
        than raised.
    """
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError("Access to local files is restricted")

        # Most basic check if the URL is valid:
        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("Invalid URL format")

        sanitized_url = sanitize_url(url)
        response = session.get(sanitized_url, timeout=timeout)

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, f"Error: HTTP {str(response.status_code)} error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, f"Error: {str(ve)}"
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request
        # (e.g., connection errors, timeouts, etc.)
        return None, f"Error: {str(re)}"
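
# Illustrative usage (not part of the original module; the URL is a placeholder):
#   response, error = get_response("https://example.com")
#   if error:
#       print(error)  # e.g. "Error: HTTP 404 error"
#   else:
#       print(response.status_code)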


def scrape_text(url: str) -> str:
    """Scrape text from a webpage

    Args:
        url (str): The URL to scrape text from

    Returns:
        str: The scraped text, or an error message on failure
    """
    response, error_message = get_response(url)
    if error_message:
        return error_message
    if not response:
        return "Error: Could not get response"

    soup = BeautifulSoup(response.text, "html.parser")

    # Drop non-content elements before extracting the text
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    # Collapse whitespace: strip each line, split phrases on double spaces,
    # and rejoin the non-empty chunks one per line
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)

    return text
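
# Illustrative usage (not part of the original module):
#   text = scrape_text("https://example.com")
#   if text.startswith("Error:"):
#       ...  # failures are returned as strings, not raised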


def scrape_links(url: str) -> str | list[str]:
    """Scrape links from a webpage

    Args:
        url (str): The URL to scrape links from

    Returns:
        str | list[str]: The formatted hyperlinks, or an error message on failure
    """
    response, error_message = get_response(url)
    if error_message:
        return error_message
    if not response:
        return "Error: Could not get response"

    soup = BeautifulSoup(response.text, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)
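
# Illustrative usage (not part of the original module):
#   links = scrape_links("https://example.com")
#   if isinstance(links, str):
#       ...  # a plain str return value is an error message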


def create_message(chunk: str, question: str) -> dict[str, str]:
    """Create a user message asking the model to answer a question about,
    or summarize, a chunk of text"""
    return {
        "role": "user",
        "content": f'"""{chunk}""" Using the above text, answer the following'
        f' question: "{question}" -- if the question cannot be answered using the'
        " text, summarize the text.",
    }
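
# Illustrative output (not part of the original module); the dict slots into an
# OpenAI-style chat `messages` list:
#   >>> create_message("Some page text.", "What is this about?")
#   {'role': 'user', 'content': '"""Some page text.""" Using the above text, ...'}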