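"""Scrape recent San Francisco 49ers news from Niners Nation (ninersnation.com),
optionally summarize each article with the OpenAI chat completions API, and
write one structured row per article to team_news_articles.csv."""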
import os
import csv
import time
from datetime import datetime, timedelta, timezone
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import openai # Added for LLM Summarization
# Load environment variables (for API keys)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o") # Default to gpt-4o if not set
if not OPENAI_API_KEY:
print("Warning: OPENAI_API_KEY not found in environment variables. Summarization will be skipped.")
# Or raise an error if summarization is critical:
# raise ValueError("OPENAI_API_KEY environment variable is required for summarization.")
TARGET_URL = "https://www.ninersnation.com/san-francisco-49ers-news"
OUTPUT_CSV_FILE = "team_news_articles.csv"
DAYS_TO_SCRAPE = 60 # Scrape articles from the past 60 days
REQUEST_DELAY = 1 # Delay in seconds between requests to be polite
# Add a flag to enable/disable summarization easily
ENABLE_SUMMARIZATION = bool(OPENAI_API_KEY)
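# Illustrative .env entries this script reads (values below are placeholders, not real keys):
#   OPENAI_API_KEY=<your OpenAI API key>
#   OPENAI_MODEL=gpt-4o   # optional; defaults to gpt-4o when unset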
def fetch_html(url):
"""Fetches HTML content from a URL with error handling."""
try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)  # Basic user agent; timeout prevents requests from hanging indefinitely
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
return response.text
except requests.exceptions.RequestException as e:
print(f"Error fetching {url}: {e}")
return None
def parse_article_list(html_content):
"""Parses the main news page to find article links and dates."""
print("Parsing article list page...")
soup = BeautifulSoup(html_content, 'html.parser')
articles = []
# SBNation common structure: find compact entry boxes
# Note: Class names might change, may need adjustment if scraping fails.
article_elements = soup.find_all('div', class_='c-entry-box--compact')
if not article_elements:
# Fallback: Try another common pattern if the first fails
article_elements = soup.find_all('div', class_='p-entry-box')
print(f"Found {len(article_elements)} potential article elements.")
for elem in article_elements:
# Find the main link within the heading
heading = elem.find('h2')
link_tag = heading.find('a', href=True) if heading else None
# Find the time tag for publication date
time_tag = elem.find('time', datetime=True)
if link_tag and time_tag and link_tag['href']:
url = link_tag['href']
# Ensure the URL is absolute
            if not url.startswith('http'):
                # SBNation article links are normally absolute URLs; rather than guessing
                # a base URL for urljoin, skip anything that looks relative.
                print(f"Warning: Found potentially relative URL: {url}. Skipping.")
                continue
date_str = time_tag['datetime'] # e.g., "2024-05-20T10:00:00-07:00"
if url and date_str:
articles.append((url, date_str))
else:
print("Skipping element: Couldn't find link or time tag.") # Debugging
print(f"Extracted {len(articles)} articles with URL and date.")
return articles
def parse_article_details(html_content, url):
"""Parses an individual article page to extract details including raw content."""
print(f"Parsing article details for: {url}")
soup = BeautifulSoup(html_content, 'html.parser')
details = {
"title": None,
"content": None, # This will store the raw content for summarization
"publication_date": None,
"link_to_article": url,
"tags": []
}
# Extract Title (Usually the main H1)
title_tag = soup.find('h1') # Find the first H1
if title_tag:
details['title'] = title_tag.get_text(strip=True)
else:
print(f"Warning: Title tag (h1) not found for {url}")
# Extract Publication Date (Look for time tag in byline)
# SBNation often uses <span class="c-byline__item"><time ...></span>
byline_time_tag = soup.find('span', class_='c-byline__item')
time_tag = byline_time_tag.find('time', datetime=True) if byline_time_tag else None
if time_tag and time_tag.get('datetime'):
details['publication_date'] = time_tag['datetime']
else:
# Fallback: Search for any time tag with datetime attribute if specific class fails
time_tag = soup.find('time', datetime=True)
if time_tag and time_tag.get('datetime'):
details['publication_date'] = time_tag['datetime']
else:
print(f"Warning: Publication date tag (time[datetime]) not found for {url}")
# Extract Content (Paragraphs within the main content div)
content_div = soup.find('div', class_='c-entry-content')
if content_div:
paragraphs = content_div.find_all('p')
# Join non-empty paragraphs, ensuring None safety
# Store this raw content for potential summarization
details['content'] = '\n\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
else:
print(f"Warning: Content div (div.c-entry-content) not found for {url}")
# Extract Tags (Look for tags/labels, e.g., under "Filed under:")
# SBNation often uses a ul/div with class like 'c-entry-group-labels' or 'c-entry-tags'
tags_container = soup.find('ul', class_='m-tags__list') # A common SBNation tag structure
if tags_container:
tag_elements = tags_container.find_all('a') # Tags are usually links
details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)]))
else:
# Fallback: Look for another potential container like the one in the example text
filed_under_div = soup.find('div', class_='c-entry-group-labels') # Another possible class
if filed_under_div:
tag_elements = filed_under_div.find_all('a')
details['tags'] = list(set([tag.get_text(strip=True) for tag in tag_elements if tag.get_text(strip=True)]))
        else:
            # A text-based search around the 'Filed under:' heading would also work, but it
            # is more fragile, so only the class-based lookups above are attempted.
            print(f"Warning: Tags container not found using common classes for {url}")
# Basic validation - ensure essential fields were extracted for basic processing
# Content is needed for summarization but might be missing on some pages (e.g., galleries)
if not details['title'] or not details['publication_date']:
print(f"Failed to extract essential details (title or date) for {url}. Returning None.")
return None
# Content check specifically before returning - needed for summary
if not details['content']:
print(f"Warning: Missing content for {url}. Summary cannot be generated.")
return details
def is_within_timeframe(date_str, days):
"""Checks if a date string (ISO format) is within the specified number of days from now."""
if not date_str:
return False
try:
# Parse the ISO format date string, handling potential 'Z' for UTC
pub_date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
# Ensure pub_date is offset-aware (has timezone info)
# If fromisoformat gives naive datetime, assume UTC (common practice for 'Z')
if pub_date.tzinfo is None or pub_date.tzinfo.utcoffset(pub_date) is None:
pub_date = pub_date.replace(tzinfo=timezone.utc) # Assume UTC if naive
# Get current time as an offset-aware datetime in UTC
now_utc = datetime.now(timezone.utc)
# Calculate the cutoff date
cutoff_date = now_utc - timedelta(days=days)
# Compare offset-aware datetimes
return pub_date >= cutoff_date
except ValueError as e:
print(f"Could not parse date: {date_str}. Error: {e}")
return False # Skip if date parsing fails
except Exception as e:
print(f"Unexpected error during date comparison for {date_str}: {e}")
return False
def generate_summary(article_content):
"""Generates a 3-4 sentence summary using OpenAI API."""
if not ENABLE_SUMMARIZATION or not article_content:
print("Skipping summary generation (disabled or no content).")
return "" # Return empty string if summarization skipped or no content
print("Generating summary...")
try:
client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # Limit content length to avoid excessive token usage (adjust the limit as needed).
        # Truncate the article text *before* building the prompt so the closing
        # "Summary:" instruction is never cut off.
        max_content_length = 15000  # Approximate character limit; GPT-4o's context window is large, but be mindful of cost/speed
        if len(article_content) > max_content_length:
            print(f"Warning: Content too long ({len(article_content)} chars), truncating for summarization.")
            article_content = article_content[:max_content_length]
        # Simple prompt for summarization
        prompt = f"""Please provide a concise 3-4 sentence summary of the following article content.
Focus on the key information and main points. Do not include any information not present in the text.
---
{article_content}
---
Summary:"""
response = client.chat.completions.create(
model=OPENAI_MODEL,
messages=[
{"role": "system", "content": "You are an AI assistant tasked with summarizing news articles concisely."},
{"role": "user", "content": prompt}
],
temperature=0.5, # Adjust for desired creativity vs factuality
max_tokens=150 # Limit summary length
)
summary = response.choices[0].message.content.strip()
print("Summary generated successfully.")
return summary
    # Catch the more specific OpenAI errors before the generic APIError,
    # otherwise the broader handler would swallow them.
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during summarization: {e}")
return "" # Return empty string on failure
def scrape_and_summarize_niners_nation():
"""Main function to scrape, parse, summarize, and return structured data."""
print("Starting Niners Nation scraping and summarization process...")
main_page_html = fetch_html(TARGET_URL)
if not main_page_html:
print("Failed to fetch the main news page. Exiting.")
return []
articles_on_page = parse_article_list(main_page_html)
scraped_and_summarized_data = []
now_utc = datetime.now(timezone.utc)
cutoff_datetime = now_utc - timedelta(days=DAYS_TO_SCRAPE)
print(f"Filtering articles published since {cutoff_datetime.strftime('%Y-%m-%d %H:%M:%S %Z')}")
processed_urls = set()
for url, date_str in articles_on_page:
if url in processed_urls:
continue
if not is_within_timeframe(date_str, DAYS_TO_SCRAPE):
continue
print(f"Fetching article: {url}")
article_html = fetch_html(url)
if article_html:
details = parse_article_details(article_html, url)
if details:
# Generate summary if content exists and summarization enabled
article_summary = "" # Initialize summary
if details.get('content'):
article_summary = generate_summary(details['content'])
else:
print(f"Skipping summary for {url} due to missing content.")
# Add the summary to the details dictionary
details['summary'] = article_summary
# Proceed to structure data (now including the summary)
structured_row = structure_data_for_csv_row(details) # Use a helper for single row
if structured_row:
scraped_and_summarized_data.append(structured_row)
processed_urls.add(url)
print(f"Successfully scraped and summarized: {details['title']}")
else:
print(f"Failed to structure data for {url}")
else:
print(f"Failed to parse essential details for article: {url}")
else:
print(f"Failed to fetch article page: {url}")
print(f"Waiting for {REQUEST_DELAY} second(s)...")
time.sleep(REQUEST_DELAY)
print(f"Scraping & Summarization finished. Collected {len(scraped_and_summarized_data)} articles.")
return scraped_and_summarized_data
def structure_data_for_csv_row(article_details):
"""Processes a single article's details into the final CSV structure."""
current_year = datetime.now().year
# Extract and parse publication date to get the year
season = current_year # Default to current year
pub_date_str = article_details.get("publication_date")
if pub_date_str:
try:
pub_date = datetime.fromisoformat(pub_date_str.replace('Z', '+00:00'))
season = pub_date.year
except ValueError:
print(f"Warning: Could not parse date '{pub_date_str}' for season. Using default {current_year}.")
# Get tags and format as topic string
tags = article_details.get("tags", [])
topic = ", ".join(tags) if tags else "General News"
# Build the dictionary for the CSV row
structured_row = {
"Team_name": "San Francisco 49ers",
"season": season,
"city": "San Francisco",
"conference": "NFC",
"division": "West",
"logo_url": "",
"summary": article_details.get("summary", ""), # Get the generated summary
"topic": topic,
"link_to_article": article_details.get("link_to_article", ""),
}
return structured_row
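# Illustrative example of one structured row (summary, topic, and link depend on the scraped article):
#   {"Team_name": "San Francisco 49ers", "season": 2024, "city": "San Francisco",
#    "conference": "NFC", "division": "West", "logo_url": "",
#    "summary": "<3-4 sentence LLM summary>", "topic": "<comma-separated tags or 'General News'>",
#    "link_to_article": "https://www.ninersnation.com/..."}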
def write_to_csv(data, filename):
"""Writes the structured data to a CSV file."""
if not data:
print("No data to write to CSV.")
return
fieldnames = [
"Team_name", "season", "city", "conference", "division",
"logo_url", "summary", "topic", "link_to_article"
]
if not all(key in data[0] for key in fieldnames):
print(f"Error: Mismatch between defined fieldnames and data keys.")
print(f"Expected: {fieldnames}")
print(f"Got keys: {list(data[0].keys())}")
return
print(f"Writing {len(data)} rows to {filename}...")
try:
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(data)
print(f"Successfully wrote {len(data)} rows to {filename}")
except IOError as e:
print(f"Error writing to CSV file {filename}: {e}")
except Exception as e:
print(f"An unexpected error occurred during CSV writing: {e}")
# --- Main Execution ---
if __name__ == "__main__":
# Call the main orchestrator function that includes summarization
processed_articles = scrape_and_summarize_niners_nation()
if processed_articles:
write_to_csv(processed_articles, OUTPUT_CSV_FILE)
else:
print("No articles were processed.")