import os import csv import time from datetime import datetime, timedelta, timezone import requests from bs4 import BeautifulSoup from dotenv import load_dotenv import openai # Added for LLM Summarization # Load environment variables (for API keys) load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o") # Default to gpt-4o if not set if not OPENAI_API_KEY: print("Warning: OPENAI_API_KEY not found in environment variables. Summarization will be skipped.") # Or raise an error if summarization is critical: # raise ValueError("OPENAI_API_KEY environment variable is required for summarization.") TARGET_URL = "https://www.ninersnation.com/san-francisco-49ers-news" OUTPUT_CSV_FILE = "team_news_articles.csv" DAYS_TO_SCRAPE = 60 # Scrape articles from the past 60 days REQUEST_DELAY = 1 # Delay in seconds between requests to be polite # Add a flag to enable/disable summarization easily ENABLE_SUMMARIZATION = True if OPENAI_API_KEY else False def fetch_html(url): """Fetches HTML content from a URL with error handling.""" try: response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}) # Basic user-agent response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) return response.text except requests.exceptions.RequestException as e: print(f"Error fetching {url}: {e}") return None def parse_article_list(html_content): """Parses the main news page to find article links and dates.""" print("Parsing article list page...") soup = BeautifulSoup(html_content, 'html.parser') articles = [] # SBNation common structure: find compact entry boxes # Note: Class names might change, may need adjustment if scraping fails. article_elements = soup.find_all('div', class_='c-entry-box--compact') if not article_elements: # Fallback: Try another common pattern if the first fails article_elements = soup.find_all('div', class_='p-entry-box') print(f"Found {len(article_elements)} potential article elements.") for elem in article_elements: # Find the main link within the heading heading = elem.find('h2') link_tag = heading.find('a', href=True) if heading else None # Find the time tag for publication date time_tag = elem.find('time', datetime=True) if link_tag and time_tag and link_tag['href']: url = link_tag['href'] # Ensure the URL is absolute if not url.startswith('http'): # Attempt to join with base URL (requires knowing the base, careful with relative paths) # For now, we'll rely on SBNation typically using absolute URLs or full paths # from urllib.parse import urljoin # base_url = "https://www.ninersnation.com" # url = urljoin(base_url, url) # Let's assume they are absolute for now based on typical SBNation structure print(f"Warning: Found potentially relative URL: {url}. Skipping for now.") continue # Skip potentially relative URLs date_str = time_tag['datetime'] # e.g., "2024-05-20T10:00:00-07:00" if url and date_str: articles.append((url, date_str)) else: print("Skipping element: Couldn't find link or time tag.") # Debugging print(f"Extracted {len(articles)} articles with URL and date.") return articles def parse_article_details(html_content, url): """Parses an individual article page to extract details including raw content.""" print(f"Parsing article details for: {url}") soup = BeautifulSoup(html_content, 'html.parser') details = { "title": None, "content": None, # This will store the raw content for summarization "publication_date": None, "link_to_article": url, "tags": [] } # Extract Title (Usually the main H1) title_tag = soup.find('h1') # Find the first H1 if title_tag: details['title'] = title_tag.get_text(strip=True) else: print(f"Warning: Title tag (h1) not found for {url}") # Extract Publication Date (Look for time tag in byline) # SBNation often uses byline_time_tag = soup.find('span', class_='c-byline__item') time_tag = byline_time_tag.find('time', datetime=True) if byline_time_tag else None if time_tag and time_tag.get('datetime'): details['publication_date'] = time_tag['datetime'] else: # Fallback: Search for any time tag with datetime attribute if specific class fails time_tag = soup.find('time', datetime=True) if time_tag and time_tag.get('datetime'): details['publication_date'] = time_tag['datetime'] else: print(f"Warning: Publication date tag (time[datetime]) not found for {url}") # Extract Content (Paragraphs within the main content div) content_div = soup.find('div', class_='c-entry-content') if content_div: paragraphs = content_div.find_all('p') # Join non-empty paragraphs, ensuring None safety # Store this raw content for potential summarization details['content'] = '\n\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]) else: print(f"Warning: Content div (div.c-entry-content) not found for {url}") # Extract Tags (Look for tags/labels, e.g., under "Filed under:") # SBNation often uses a ul/div with class like 'c-entry-group-labels' or 'c-entry-tags' tags_container = soup.find('ul', class_='m-tags__list') # A common SBNation tag structure if tags_container: tag_elements = tags_container.find_all('a') # Tags are usually links details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)])) else: # Fallback: Look for another potential container like the one in the example text filed_under_div = soup.find('div', class_='c-entry-group-labels') # Another possible class if filed_under_div: tag_elements = filed_under_div.find_all('a') details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)])) else: # Specific structure from example text if needed ('Filed under:' section) # This requires finding the specific structure around 'Filed under:' # Could be more fragile, attempt simpler methods first. print(f"Warning: Tags container not found using common classes for {url}") # Example: Search based on text 'Filed under:' - less reliable # filed_under_header = soup.find(lambda tag: tag.name == 'h2' and 'Filed under:' in tag.get_text()) # if filed_under_header: # parent_or_sibling = filed_under_header.parent # Adjust based on actual structure # tag_elements = parent_or_sibling.find_all('a') if parent_or_sibling else [] # details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements])) # Basic validation - ensure essential fields were extracted for basic processing # Content is needed for summarization but might be missing on some pages (e.g., galleries) if not details['title'] or not details['publication_date']: print(f"Failed to extract essential details (title or date) for {url}. Returning None.") return None # Content check specifically before returning - needed for summary if not details['content']: print(f"Warning: Missing content for {url}. Summary cannot be generated.") return details def is_within_timeframe(date_str, days): """Checks if a date string (ISO format) is within the specified number of days from now.""" if not date_str: return False try: # Parse the ISO format date string, handling potential 'Z' for UTC pub_date = datetime.fromisoformat(date_str.replace('Z', '+00:00')) # Ensure pub_date is offset-aware (has timezone info) # If fromisoformat gives naive datetime, assume UTC (common practice for 'Z') if pub_date.tzinfo is None or pub_date.tzinfo.utcoffset(pub_date) is None: pub_date = pub_date.replace(tzinfo=timezone.utc) # Assume UTC if naive # Get current time as an offset-aware datetime in UTC now_utc = datetime.now(timezone.utc) # Calculate the cutoff date cutoff_date = now_utc - timedelta(days=days) # Compare offset-aware datetimes return pub_date >= cutoff_date except ValueError as e: print(f"Could not parse date: {date_str}. Error: {e}") return False # Skip if date parsing fails except Exception as e: print(f"Unexpected error during date comparison for {date_str}: {e}") return False def generate_summary(article_content): """Generates a 3-4 sentence summary using OpenAI API.""" if not ENABLE_SUMMARIZATION or not article_content: print("Skipping summary generation (disabled or no content).") return "" # Return empty string if summarization skipped or no content print("Generating summary...") try: client = openai.OpenAI(api_key=OPENAI_API_KEY) # Simple prompt for summarization prompt = f"""Please provide a concise 3-4 sentence summary of the following article content. Focus on the key information and main points. Do not include any information not present in the text. : --- {article_content} --- Summary:""" # Limit content length to avoid excessive token usage (adjust limit as needed) max_content_length = 15000 # Approx limit, GPT-4o context window is large but be mindful of cost/speed if len(prompt) > max_content_length: print(f"Warning: Content too long ({len(article_content)} chars), truncating for summarization.") # Truncate content intelligently if needed, here just slicing prompt prompt = prompt[:max_content_length] response = client.chat.completions.create( model=OPENAI_MODEL, messages=[ {"role": "system", "content": "You are an AI assistant tasked with summarizing news articles concisely."}, {"role": "user", "content": prompt} ], temperature=0.5, # Adjust for desired creativity vs factuality max_tokens=150 # Limit summary length ) summary = response.choices[0].message.content.strip() print("Summary generated successfully.") return summary except openai.APIError as e: print(f"OpenAI API returned an API Error: {e}") except openai.APIConnectionError as e: print(f"Failed to connect to OpenAI API: {e}") except openai.RateLimitError as e: print(f"OpenAI API request exceeded rate limit: {e}") except Exception as e: print(f"An unexpected error occurred during summarization: {e}") return "" # Return empty string on failure def scrape_and_summarize_niners_nation(): """Main function to scrape, parse, summarize, and return structured data.""" print("Starting Niners Nation scraping and summarization process...") main_page_html = fetch_html(TARGET_URL) if not main_page_html: print("Failed to fetch the main news page. Exiting.") return [] articles_on_page = parse_article_list(main_page_html) scraped_and_summarized_data = [] now_utc = datetime.now(timezone.utc) cutoff_datetime = now_utc - timedelta(days=DAYS_TO_SCRAPE) print(f"Filtering articles published since {cutoff_datetime.strftime('%Y-%m-%d %H:%M:%S %Z')}") processed_urls = set() for url, date_str in articles_on_page: if url in processed_urls: continue if not is_within_timeframe(date_str, DAYS_TO_SCRAPE): continue print(f"Fetching article: {url}") article_html = fetch_html(url) if article_html: details = parse_article_details(article_html, url) if details: # Generate summary if content exists and summarization enabled article_summary = "" # Initialize summary if details.get('content'): article_summary = generate_summary(details['content']) else: print(f"Skipping summary for {url} due to missing content.") # Add the summary to the details dictionary details['summary'] = article_summary # Proceed to structure data (now including the summary) structured_row = structure_data_for_csv_row(details) # Use a helper for single row if structured_row: scraped_and_summarized_data.append(structured_row) processed_urls.add(url) print(f"Successfully scraped and summarized: {details['title']}") else: print(f"Failed to structure data for {url}") else: print(f"Failed to parse essential details for article: {url}") else: print(f"Failed to fetch article page: {url}") print(f"Waiting for {REQUEST_DELAY} second(s)...") time.sleep(REQUEST_DELAY) print(f"Scraping & Summarization finished. Collected {len(scraped_and_summarized_data)} articles.") return scraped_and_summarized_data def structure_data_for_csv_row(article_details): """Processes a single article's details into the final CSV structure.""" current_year = datetime.now().year # Extract and parse publication date to get the year season = current_year # Default to current year pub_date_str = article_details.get("publication_date") if pub_date_str: try: pub_date = datetime.fromisoformat(pub_date_str.replace('Z', '+00:00')) season = pub_date.year except ValueError: print(f"Warning: Could not parse date '{pub_date_str}' for season. Using default {current_year}.") # Get tags and format as topic string tags = article_details.get("tags", []) topic = ", ".join(tags) if tags else "General News" # Build the dictionary for the CSV row structured_row = { "Team_name": "San Francisco 49ers", "season": season, "city": "San Francisco", "conference": "NFC", "division": "West", "logo_url": "", "summary": article_details.get("summary", ""), # Get the generated summary "topic": topic, "link_to_article": article_details.get("link_to_article", ""), } return structured_row def write_to_csv(data, filename): """Writes the structured data to a CSV file.""" if not data: print("No data to write to CSV.") return fieldnames = [ "Team_name", "season", "city", "conference", "division", "logo_url", "summary", "topic", "link_to_article" ] if not all(key in data[0] for key in fieldnames): print(f"Error: Mismatch between defined fieldnames and data keys.") print(f"Expected: {fieldnames}") print(f"Got keys: {list(data[0].keys())}") return print(f"Writing {len(data)} rows to {filename}...") try: with open(filename, 'w', newline='', encoding='utf-8') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() writer.writerows(data) print(f"Successfully wrote {len(data)} rows to {filename}") except IOError as e: print(f"Error writing to CSV file {filename}: {e}") except Exception as e: print(f"An unexpected error occurred during CSV writing: {e}") # --- Main Execution --- if __name__ == "__main__": # Call the main orchestrator function that includes summarization processed_articles = scrape_and_summarize_niners_nation() if processed_articles: write_to_csv(processed_articles, OUTPUT_CSV_FILE) else: print("No articles were processed.")