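"""Scrape recent San Francisco 49ers news from Niners Nation (ninersnation.com),
optionally summarize each article with the OpenAI chat completions API, and
write one structured row per article to team_news_articles.csv."""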
import os
import csv
import time
from datetime import datetime, timedelta, timezone
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import openai # Added for LLM Summarization
# Load environment variables (for API keys)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o") # Default to gpt-4o if not set
if not OPENAI_API_KEY:
print("Warning: OPENAI_API_KEY not found in environment variables. Summarization will be skipped.")
# Or raise an error if summarization is critical:
# raise ValueError("OPENAI_API_KEY environment variable is required for summarization.")
TARGET_URL = "https://www.ninersnation.com/san-francisco-49ers-news"
OUTPUT_CSV_FILE = "team_news_articles.csv"
DAYS_TO_SCRAPE = 60 # Scrape articles from the past 60 days
REQUEST_DELAY = 1 # Delay in seconds between requests to be polite
# Add a flag to enable/disable summarization easily
ENABLE_SUMMARIZATION = bool(OPENAI_API_KEY)
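# Illustrative .env entries this script reads (values below are placeholders, not real keys):
#   OPENAI_API_KEY=<your OpenAI API key>
#   OPENAI_MODEL=gpt-4o   # optional; defaults to gpt-4o when unset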
def fetch_html(url):
"""Fetches HTML content from a URL with error handling."""
try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)  # Basic user agent; timeout prevents requests from hanging indefinitely
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
return response.text
except requests.exceptions.RequestException as e:
print(f"Error fetching {url}: {e}")
return None
def parse_article_list(html_content):
"""Parses the main news page to find article links and dates."""
print("Parsing article list page...")
soup = BeautifulSoup(html_content, 'html.parser')
articles = []
# SBNation common structure: find compact entry boxes
# Note: Class names might change, may need adjustment if scraping fails.
article_elements = soup.find_all('div', class_='c-entry-box--compact')
if not article_elements:
# Fallback: Try another common pattern if the first fails
article_elements = soup.find_all('div', class_='p-entry-box')
print(f"Found {len(article_elements)} potential article elements.")
for elem in article_elements:
# Find the main link within the heading
heading = elem.find('h2')
link_tag = heading.find('a', href=True) if heading else None
# Find the time tag for publication date
time_tag = elem.find('time', datetime=True)
if link_tag and time_tag and link_tag['href']:
url = link_tag['href']
# Ensure the URL is absolute
            if not url.startswith('http'):
                # SBNation article links are normally absolute URLs; rather than guessing
                # a base URL for urljoin, skip anything that looks relative.
                print(f"Warning: Found potentially relative URL: {url}. Skipping.")
                continue
date_str = time_tag['datetime'] # e.g., "2024-05-20T10:00:00-07:00"
if url and date_str:
articles.append((url, date_str))
else:
print("Skipping element: Couldn't find link or time tag.") # Debugging
print(f"Extracted {len(articles)} articles with URL and date.")
return articles
def parse_article_details(html_content, url):
"""Parses an individual article page to extract details including raw content."""
print(f"Parsing article details for: {url}")
soup = BeautifulSoup(html_content, 'html.parser')
details = {
"title": None,
"content": None, # This will store the raw content for summarization
"publication_date": None,
"link_to_article": url,
"tags": []
}
# Extract Title (Usually the main H1)
title_tag = soup.find('h1') # Find the first H1
if title_tag:
details['title'] = title_tag.get_text(strip=True)
else:
print(f"Warning: Title tag (h1) not found for {url}")
# Extract Publication Date (Look for time tag in byline)
# SBNation often uses <span class="c-byline__item"><time ...></span>
byline_time_tag = soup.find('span', class_='c-byline__item')
time_tag = byline_time_tag.find('time', datetime=True) if byline_time_tag else None
if time_tag and time_tag.get('datetime'):
details['publication_date'] = time_tag['datetime']
else:
# Fallback: Search for any time tag with datetime attribute if specific class fails
time_tag = soup.find('time', datetime=True)
if time_tag and time_tag.get('datetime'):
details['publication_date'] = time_tag['datetime']
else:
print(f"Warning: Publication date tag (time[datetime]) not found for {url}")
# Extract Content (Paragraphs within the main content div)
content_div = soup.find('div', class_='c-entry-content')
if content_div:
paragraphs = content_div.find_all('p')
# Join non-empty paragraphs, ensuring None safety
# Store this raw content for potential summarization
details['content'] = '\n\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
else:
print(f"Warning: Content div (div.c-entry-content) not found for {url}")
# Extract Tags (Look for tags/labels, e.g., under "Filed under:")
# SBNation often uses a ul/div with class like 'c-entry-group-labels' or 'c-entry-tags'
tags_container = soup.find('ul', class_='m-tags__list') # A common SBNation tag structure
if tags_container:
tag_elements = tags_container.find_all('a') # Tags are usually links
details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)]))
else:
# Fallback: Look for another potential container like the one in the example text
filed_under_div = soup.find('div', class_='c-entry-group-labels') # Another possible class
if filed_under_div:
tag_elements = filed_under_div.find_all('a')
details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)]))
        else:
            # A text-based search around the 'Filed under:' heading would also work, but it
            # is more fragile, so only the class-based lookups above are attempted.
            print(f"Warning: Tags container not found using common classes for {url}")
# Basic validation - ensure essential fields were extracted for basic processing
# Content is needed for summarization but might be missing on some pages (e.g., galleries)
if not details['title'] or not details['publication_date']:
print(f"Failed to extract essential details (title or date) for {url}. Returning None.")
return None
# Content check specifically before returning - needed for summary
if not details['content']:
print(f"Warning: Missing content for {url}. Summary cannot be generated.")
return details
def is_within_timeframe(date_str, days):
"""Checks if a date string (ISO format) is within the specified number of days from now."""
if not date_str:
return False
try:
# Parse the ISO format date string, handling potential 'Z' for UTC
pub_date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
# Ensure pub_date is offset-aware (has timezone info)
# If fromisoformat gives naive datetime, assume UTC (common practice for 'Z')
if pub_date.tzinfo is None or pub_date.tzinfo.utcoffset(pub_date) is None:
pub_date = pub_date.replace(tzinfo=timezone.utc) # Assume UTC if naive
# Get current time as an offset-aware datetime in UTC
now_utc = datetime.now(timezone.utc)
# Calculate the cutoff date
cutoff_date = now_utc - timedelta(days=days)
# Compare offset-aware datetimes
return pub_date >= cutoff_date
except ValueError as e:
print(f"Could not parse date: {date_str}. Error: {e}")
return False # Skip if date parsing fails
except Exception as e:
print(f"Unexpected error during date comparison for {date_str}: {e}")
return False
def generate_summary(article_content):
"""Generates a 3-4 sentence summary using OpenAI API."""
if not ENABLE_SUMMARIZATION or not article_content:
print("Skipping summary generation (disabled or no content).")
return "" # Return empty string if summarization skipped or no content
print("Generating summary...")
try:
client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # Limit content length to avoid excessive token usage (adjust the limit as needed).
        # Truncate the article text *before* building the prompt so the closing
        # "Summary:" instruction is never cut off.
        max_content_length = 15000  # Approximate character limit; GPT-4o's context window is large, but be mindful of cost/speed
        if len(article_content) > max_content_length:
            print(f"Warning: Content too long ({len(article_content)} chars), truncating for summarization.")
            article_content = article_content[:max_content_length]
        # Simple prompt for summarization
        prompt = f"""Please provide a concise 3-4 sentence summary of the following article content.
Focus on the key information and main points. Do not include any information not present in the text.
---
{article_content}
---
Summary:"""
response = client.chat.completions.create(
model=OPENAI_MODEL,
messages=[
{"role": "system", "content": "You are an AI assistant tasked with summarizing news articles concisely."},
{"role": "user", "content": prompt}
],
temperature=0.5, # Adjust for desired creativity vs factuality
max_tokens=150 # Limit summary length
)
summary = response.choices[0].message.content.strip()
print("Summary generated successfully.")
return summary
    # Catch the more specific OpenAI errors before the generic APIError,
    # otherwise the broader handler would swallow them.
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during summarization: {e}")
return "" # Return empty string on failure
def scrape_and_summarize_niners_nation():
"""Main function to scrape, parse, summarize, and return structured data."""
print("Starting Niners Nation scraping and summarization process...")
main_page_html = fetch_html(TARGET_URL)
if not main_page_html:
print("Failed to fetch the main news page. Exiting.")
return []
articles_on_page = parse_article_list(main_page_html)
scraped_and_summarized_data = []
now_utc = datetime.now(timezone.utc)
cutoff_datetime = now_utc - timedelta(days=DAYS_TO_SCRAPE)
print(f"Filtering articles published since {cutoff_datetime.strftime('%Y-%m-%d %H:%M:%S %Z')}")
processed_urls = set()
for url, date_str in articles_on_page:
if url in processed_urls:
continue
if not is_within_timeframe(date_str, DAYS_TO_SCRAPE):
continue
print(f"Fetching article: {url}")
article_html = fetch_html(url)
if article_html:
details = parse_article_details(article_html, url)
if details:
# Generate summary if content exists and summarization enabled
article_summary = "" # Initialize summary
if details.get('content'):
article_summary = generate_summary(details['content'])
else:
print(f"Skipping summary for {url} due to missing content.")
# Add the summary to the details dictionary
details['summary'] = article_summary
# Proceed to structure data (now including the summary)
structured_row = structure_data_for_csv_row(details) # Use a helper for single row
if structured_row:
scraped_and_summarized_data.append(structured_row)
processed_urls.add(url)
print(f"Successfully scraped and summarized: {details['title']}")
else:
print(f"Failed to structure data for {url}")
else:
print(f"Failed to parse essential details for article: {url}")
else:
print(f"Failed to fetch article page: {url}")
print(f"Waiting for {REQUEST_DELAY} second(s)...")
time.sleep(REQUEST_DELAY)
print(f"Scraping & Summarization finished. Collected {len(scraped_and_summarized_data)} articles.")
return scraped_and_summarized_data
def structure_data_for_csv_row(article_details):
"""Processes a single article's details into the final CSV structure."""
current_year = datetime.now().year
# Extract and parse publication date to get the year
season = current_year # Default to current year
pub_date_str = article_details.get("publication_date")
if pub_date_str:
try:
pub_date = datetime.fromisoformat(pub_date_str.replace('Z', '+00:00'))
season = pub_date.year
except ValueError:
print(f"Warning: Could not parse date '{pub_date_str}' for season. Using default {current_year}.")
# Get tags and format as topic string
tags = article_details.get("tags", [])
topic = ", ".join(tags) if tags else "General News"
# Build the dictionary for the CSV row
structured_row = {
"Team_name": "San Francisco 49ers",
"season": season,
"city": "San Francisco",
"conference": "NFC",
"division": "West",
"logo_url": "",
"summary": article_details.get("summary", ""), # Get the generated summary
"topic": topic,
"link_to_article": article_details.get("link_to_article", ""),
}
return structured_row
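# Illustrative example of one structured row (summary, topic, and link depend on the scraped article):
#   {"Team_name": "San Francisco 49ers", "season": 2024, "city": "San Francisco",
#    "conference": "NFC", "division": "West", "logo_url": "",
#    "summary": "<3-4 sentence LLM summary>", "topic": "<comma-separated tags or 'General News'>",
#    "link_to_article": "https://www.ninersnation.com/..."}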
def write_to_csv(data, filename):
"""Writes the structured data to a CSV file."""
if not data:
print("No data to write to CSV.")
return
fieldnames = [
"Team_name", "season", "city", "conference", "division",
"logo_url", "summary", "topic", "link_to_article"
]
if not all(key in data[0] for key in fieldnames):
print(f"Error: Mismatch between defined fieldnames and data keys.")
print(f"Expected: {fieldnames}")
print(f"Got keys: {list(data[0].keys())}")
return
print(f"Writing {len(data)} rows to {filename}...")
try:
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(data)
print(f"Successfully wrote {len(data)} rows to {filename}")
except IOError as e:
print(f"Error writing to CSV file {filename}: {e}")
except Exception as e:
print(f"An unexpected error occurred during CSV writing: {e}")
# --- Main Execution ---
if __name__ == "__main__":
# Call the main orchestrator function that includes summarization
processed_articles = scrape_and_summarize_niners_nation()
if processed_articles:
write_to_csv(processed_articles, OUTPUT_CSV_FILE)
else:
print("No articles were processed.")