import os
import csv
import time
from datetime import datetime, timedelta, timezone

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import openai  # Added for LLM summarization

# Load environment variables (for API keys)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")  # Default to gpt-4o if not set

if not OPENAI_API_KEY:
    print("Warning: OPENAI_API_KEY not found in environment variables. Summarization will be skipped.")
    # Or raise an error if summarization is critical:
    # raise ValueError("OPENAI_API_KEY environment variable is required for summarization.")

TARGET_URL = "https://www.ninersnation.com/san-francisco-49ers-news"
OUTPUT_CSV_FILE = "team_news_articles.csv"
DAYS_TO_SCRAPE = 60  # Scrape articles from the past 60 days
REQUEST_DELAY = 1  # Delay in seconds between requests to be polite

# Flag to enable/disable summarization easily
ENABLE_SUMMARIZATION = bool(OPENAI_API_KEY)
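
# Example .env file this script expects (illustrative only; the values below are
# placeholders, not real credentials):
#
#   OPENAI_API_KEY=sk-your-key-here
#   OPENAI_MODEL=gpt-4o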

def fetch_html(url):
    """Fetches HTML content from a URL with error handling."""
    try:
        # Basic user-agent; a timeout is set so a stalled connection cannot hang the run
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_article_list(html_content):
    """Parses the main news page to find article links and dates."""
    print("Parsing article list page...")
    soup = BeautifulSoup(html_content, 'html.parser')
    articles = []

    # SBNation common structure: find compact entry boxes.
    # Note: class names might change; adjust if scraping fails.
    article_elements = soup.find_all('div', class_='c-entry-box--compact')
    if not article_elements:
        # Fallback: try another common pattern if the first fails
        article_elements = soup.find_all('div', class_='p-entry-box')
    print(f"Found {len(article_elements)} potential article elements.")

    for elem in article_elements:
        # Find the main link within the heading
        heading = elem.find('h2')
        link_tag = heading.find('a', href=True) if heading else None
        # Find the time tag for the publication date
        time_tag = elem.find('time', datetime=True)

        if link_tag and time_tag and link_tag['href']:
            url = link_tag['href']
            # Ensure the URL is absolute. Relative URLs could be resolved with
            # urllib.parse.urljoin against the site's base URL, but SBNation pages
            # typically emit absolute URLs, so relative ones are skipped here.
            # from urllib.parse import urljoin
            # url = urljoin("https://www.ninersnation.com", url)
            if not url.startswith('http'):
                print(f"Warning: Found potentially relative URL: {url}. Skipping for now.")
                continue
            date_str = time_tag['datetime']  # e.g., "2024-05-20T10:00:00-07:00"
            if url and date_str:
                articles.append((url, date_str))
        else:
            print("Skipping element: Couldn't find link or time tag.")  # Debugging

    print(f"Extracted {len(articles)} articles with URL and date.")
    return articles
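
# Illustrative shape of the listing markup the parser above targets (assumed from
# common SBNation layouts; the live page may differ):
#
#   <div class="c-entry-box--compact">
#     <h2><a href="https://www.ninersnation.com/2024/5/20/some-slug">Headline</a></h2>
#     <time datetime="2024-05-20T10:00:00-07:00">May 20</time>
#   </div>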

def parse_article_details(html_content, url):
    """Parses an individual article page to extract details, including raw content."""
    print(f"Parsing article details for: {url}")
    soup = BeautifulSoup(html_content, 'html.parser')
    details = {
        "title": None,
        "content": None,  # Raw article text, kept for summarization
        "publication_date": None,
        "link_to_article": url,
        "tags": []
    }

    # Extract title (usually the main H1)
    title_tag = soup.find('h1')  # First H1 on the page
    if title_tag:
        details['title'] = title_tag.get_text(strip=True)
    else:
        print(f"Warning: Title tag (h1) not found for {url}")

    # Extract publication date (look for a time tag in the byline).
    # SBNation often uses <span class="c-byline__item"><time ...></span>.
    byline_time_tag = soup.find('span', class_='c-byline__item')
    time_tag = byline_time_tag.find('time', datetime=True) if byline_time_tag else None
    if time_tag and time_tag.get('datetime'):
        details['publication_date'] = time_tag['datetime']
    else:
        # Fallback: any time tag with a datetime attribute if the specific class fails
        time_tag = soup.find('time', datetime=True)
        if time_tag and time_tag.get('datetime'):
            details['publication_date'] = time_tag['datetime']
        else:
            print(f"Warning: Publication date tag (time[datetime]) not found for {url}")

    # Extract content (paragraphs within the main content div)
    content_div = soup.find('div', class_='c-entry-content')
    if content_div:
        paragraphs = content_div.find_all('p')
        # Join non-empty paragraphs; this raw text feeds the summarizer
        details['content'] = '\n\n'.join(
            p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)
        )
    else:
        print(f"Warning: Content div (div.c-entry-content) not found for {url}")

    # Extract tags (e.g., the "Filed under:" labels).
    # SBNation often uses a ul/div with a class like 'm-tags__list' or 'c-entry-group-labels'.
    tags_container = soup.find('ul', class_='m-tags__list')
    if tags_container:
        tag_elements = tags_container.find_all('a')  # Tags are usually links
        details['tags'] = list({tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)})
    else:
        # Fallback: another possible container class
        filed_under_div = soup.find('div', class_='c-entry-group-labels')
        if filed_under_div:
            tag_elements = filed_under_div.find_all('a')
            details['tags'] = list({tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)})
        else:
            # The literal "Filed under:" section could be located by text, but that is
            # more fragile, so the simpler class-based lookups above are tried first.
            print(f"Warning: Tags container not found using common classes for {url}")
            # Example (less reliable) text-based search:
            # filed_under_header = soup.find(lambda tag: tag.name == 'h2' and 'Filed under:' in tag.get_text())
            # if filed_under_header:
            #     parent_or_sibling = filed_under_header.parent  # Adjust based on actual structure
            #     tag_elements = parent_or_sibling.find_all('a') if parent_or_sibling else []
            #     details['tags'] = list(set(tag.get_text(strip=True) for tag in tag_elements))

    # Basic validation: title and date are required for any further processing.
    # Content is needed for summarization but may be missing on some pages (e.g., galleries).
    if not details['title'] or not details['publication_date']:
        print(f"Failed to extract essential details (title or date) for {url}. Returning None.")
        return None
    if not details['content']:
        print(f"Warning: Missing content for {url}. Summary cannot be generated.")

    return details
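
# Illustrative shape of the dict returned above (values are made-up placeholders):
#
#   {
#       "title": "49ers sign veteran linebacker",
#       "content": "Paragraph one...\n\nParagraph two...",
#       "publication_date": "2024-05-20T10:00:00-07:00",
#       "link_to_article": "https://www.ninersnation.com/2024/5/20/some-slug",
#       "tags": ["San Francisco 49ers News", "NFL Free Agency"]
#   }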

def is_within_timeframe(date_str, days):
    """Checks if a date string (ISO format) is within the specified number of days from now."""
    if not date_str:
        return False
    try:
        # Parse the ISO format date string, handling a potential trailing 'Z' for UTC
        pub_date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        # Ensure pub_date is offset-aware; if fromisoformat yields a naive datetime,
        # assume UTC (common practice for 'Z')
        if pub_date.tzinfo is None or pub_date.tzinfo.utcoffset(pub_date) is None:
            pub_date = pub_date.replace(tzinfo=timezone.utc)
        # Get the current time as an offset-aware datetime in UTC and compute the cutoff
        now_utc = datetime.now(timezone.utc)
        cutoff_date = now_utc - timedelta(days=days)
        # Compare offset-aware datetimes
        return pub_date >= cutoff_date
    except ValueError as e:
        print(f"Could not parse date: {date_str}. Error: {e}")
        return False  # Skip if date parsing fails
    except Exception as e:
        print(f"Unexpected error during date comparison for {date_str}: {e}")
        return False
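
# Example usage (hypothetical values; results depend on when the script runs):
#
#   is_within_timeframe("2024-05-20T10:00:00-07:00", 60)  # True if that date falls within the last 60 days
#   is_within_timeframe("not-a-date", 60)                  # False: the parse fails and is logged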

def generate_summary(article_content):
    """Generates a 3-4 sentence summary using the OpenAI API."""
    if not ENABLE_SUMMARIZATION or not article_content:
        print("Skipping summary generation (disabled or no content).")
        return ""  # Return empty string if summarization is skipped or there is no content

    print("Generating summary...")
    try:
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        # Limit content length to avoid excessive token usage (adjust as needed).
        # GPT-4o's context window is large, but be mindful of cost and speed.
        # Truncate the article text itself so the prompt's trailing instructions survive.
        max_content_length = 15000  # Approximate character limit
        if len(article_content) > max_content_length:
            print(f"Warning: Content too long ({len(article_content)} chars), truncating for summarization.")
            article_content = article_content[:max_content_length]

        # Simple prompt for summarization
        prompt = f"""Please provide a concise 3-4 sentence summary of the following article content.
Focus on the key information and main points. Do not include any information not present in the text.
---
{article_content}
---
Summary:"""

        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "system", "content": "You are an AI assistant tasked with summarizing news articles concisely."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.5,  # Adjust for desired creativity vs. factuality
            max_tokens=150    # Limit summary length
        )
        summary = response.choices[0].message.content.strip()
        print("Summary generated successfully.")
        return summary
    # Catch the specific OpenAI errors before the generic APIError,
    # otherwise the generic handler would shadow them.
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during summarization: {e}")
    return ""  # Return empty string on failure

def scrape_and_summarize_niners_nation():
    """Main function to scrape, parse, summarize, and return structured data."""
    print("Starting Niners Nation scraping and summarization process...")
    main_page_html = fetch_html(TARGET_URL)
    if not main_page_html:
        print("Failed to fetch the main news page. Exiting.")
        return []

    articles_on_page = parse_article_list(main_page_html)
    scraped_and_summarized_data = []

    now_utc = datetime.now(timezone.utc)
    cutoff_datetime = now_utc - timedelta(days=DAYS_TO_SCRAPE)
    print(f"Filtering articles published since {cutoff_datetime.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    processed_urls = set()
    for url, date_str in articles_on_page:
        if url in processed_urls:
            continue
        if not is_within_timeframe(date_str, DAYS_TO_SCRAPE):
            continue

        print(f"Fetching article: {url}")
        article_html = fetch_html(url)
        if article_html:
            details = parse_article_details(article_html, url)
            if details:
                # Generate a summary if content exists and summarization is enabled
                article_summary = ""
                if details.get('content'):
                    article_summary = generate_summary(details['content'])
                else:
                    print(f"Skipping summary for {url} due to missing content.")
                # Add the summary to the details dictionary
                details['summary'] = article_summary

                # Structure the data (now including the summary) into a CSV row
                structured_row = structure_data_for_csv_row(details)
                if structured_row:
                    scraped_and_summarized_data.append(structured_row)
                    processed_urls.add(url)
                    print(f"Successfully scraped and summarized: {details['title']}")
                else:
                    print(f"Failed to structure data for {url}")
            else:
                print(f"Failed to parse essential details for article: {url}")
        else:
            print(f"Failed to fetch article page: {url}")

        print(f"Waiting for {REQUEST_DELAY} second(s)...")
        time.sleep(REQUEST_DELAY)

    print(f"Scraping & summarization finished. Collected {len(scraped_and_summarized_data)} articles.")
    return scraped_and_summarized_data

def structure_data_for_csv_row(article_details):
    """Processes a single article's details into the final CSV row structure."""
    current_year = datetime.now().year

    # Derive the season from the publication date's year, defaulting to the current year
    season = current_year
    pub_date_str = article_details.get("publication_date")
    if pub_date_str:
        try:
            pub_date = datetime.fromisoformat(pub_date_str.replace('Z', '+00:00'))
            season = pub_date.year
        except ValueError:
            print(f"Warning: Could not parse date '{pub_date_str}' for season. Using default {current_year}.")

    # Get tags and format them as a topic string
    tags = article_details.get("tags", [])
    topic = ", ".join(tags) if tags else "General News"

    # Build the dictionary for the CSV row
    structured_row = {
        "Team_name": "San Francisco 49ers",
        "season": season,
        "city": "San Francisco",
        "conference": "NFC",
        "division": "West",
        "logo_url": "",
        "summary": article_details.get("summary", ""),  # The generated summary
        "topic": topic,
        "link_to_article": article_details.get("link_to_article", ""),
    }
    return structured_row
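
# Illustrative row produced above (hypothetical values):
#
#   {"Team_name": "San Francisco 49ers", "season": 2024, "city": "San Francisco",
#    "conference": "NFC", "division": "West", "logo_url": "",
#    "summary": "A 3-4 sentence summary...", "topic": "49ers News, NFL Draft",
#    "link_to_article": "https://www.ninersnation.com/2024/5/20/some-slug"}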

def write_to_csv(data, filename):
    """Writes the structured data to a CSV file."""
    if not data:
        print("No data to write to CSV.")
        return

    fieldnames = [
        "Team_name", "season", "city", "conference", "division",
        "logo_url", "summary", "topic", "link_to_article"
    ]
    if not all(key in data[0] for key in fieldnames):
        print("Error: Mismatch between defined fieldnames and data keys.")
        print(f"Expected: {fieldnames}")
        print(f"Got keys: {list(data[0].keys())}")
        return

    print(f"Writing {len(data)} rows to {filename}...")
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
        print(f"Successfully wrote {len(data)} rows to {filename}")
    except IOError as e:
        print(f"Error writing to CSV file {filename}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during CSV writing: {e}")

# --- Main Execution ---
if __name__ == "__main__":
    # Run the main orchestrator, which includes summarization
    processed_articles = scrape_and_summarize_niners_nation()
    if processed_articles:
        write_to_csv(processed_articles, OUTPUT_CSV_FILE)
    else:
        print("No articles were processed.")