# aliss77777's picture
# Upload folder using huggingface_hub
# 06cb2a3 verified
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import re
import json
import logging
# Set up logging: configure the root logger to emit INFO and above, with each
# record prefixed by a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Constants
# Index page the scraper fetches first and scans for links to team pages.
NFL_TEAMS_URL = "https://www.nfl.com/teams/"
# Directory that receives the downloaded logo files.
OUTPUT_DIR = "team_logos"
# CSV file listing every successfully downloaded logo (name, URL, local path).
CSV_OUTPUT = "nfl_team_logos.csv"
# Number of teams the script expects; fewer results trigger the backup sources
# and a warning in the final summary.
EXPECTED_TEAM_COUNT = 32
def ensure_output_dir(dir_path):
    """Create ``dir_path`` (including missing parents) if it does not exist.

    Args:
        dir_path: Path of the directory that must exist after the call.

    Returns:
        None. An INFO message is logged only when the directory was missing
        and has been created by this call.
    """
    if os.path.exists(dir_path):
        return
    # exist_ok=True closes the gap between the check above and the creation:
    # if something else creates the directory in between, the call still
    # succeeds instead of raising FileExistsError.
    os.makedirs(dir_path, exist_ok=True)
    logger.info(f"Created directory: {dir_path}")
def download_image(url, file_path, timeout=30):
    """Download the resource at ``url`` and store it at ``file_path``.

    The body is streamed into a temporary ``<file_path>.part`` file and moved
    into place only after the whole transfer succeeded, so a failed download
    never leaves a truncated image at ``file_path``.

    Args:
        url: Address of the image to fetch.
        file_path: Destination path of the downloaded file.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``. Without it a stalled server would
            block the script indefinitely.

    Returns:
        True if the file was written, False on any failure (the error is
        logged instead of raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    partial_path = f"{file_path}.part"
    try:
        # The context manager releases the streamed connection even when the
        # transfer is aborted half-way.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
        return True
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result and treat every
        # failure (network, HTTP status, disk) the same way.
        logger.error(f"Failed to download image from {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written yet, or the leftover cannot be removed;
            # either way there is nothing more to clean up here.
            pass
        return False
def _collect_team_page_urls(soup):
    """Map each team slug linked from the teams index page to its page URL."""
    team_urls = {}
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        # Only hrefs that go at least one level below /teams/ can be team pages.
        if '/teams/' not in link or link.count('/') < 3:
            continue
        # Extract team slug (e.g., 'cardinals', '49ers'); deeper paths such as
        # /teams/<slug>/roster do not match because of the end anchor.
        match = re.search(r'/teams/([a-z0-9-]+)/?$', link)
        if not match:
            continue
        team_slug = match.group(1)
        if team_slug not in team_urls:
            team_urls[team_slug] = link if link.startswith('http') else f"https://www.nfl.com{link}"
    return team_urls


def _team_name_from_page(team_soup, slug):
    """Return the text before the first '|' of <title>, else a name built from the slug."""
    title_tag = team_soup.find('title')
    if title_tag:
        team_name = title_tag.text.split('|')[0].strip()
        if team_name:
            return team_name
    return slug.replace('-', ' ').title()


def _logo_from_scripts(team_soup):
    """Return the first non-empty string "logo" value found in inline script JSON, or None."""
    for script in team_soup.find_all('script'):
        text = script.string
        if not text:
            continue
        lowered = text.lower()
        if 'logo' not in lowered and 'image' not in lowered:
            continue
        for candidate in re.findall(r'({.*?"logo".*?})', text):
            try:
                data = json.loads(candidate)
            except ValueError:
                # The regex only guesses at object boundaries; most candidates
                # are not valid JSON on their own.
                continue
            logo = data.get('logo') if isinstance(data, dict) else None
            if isinstance(logo, str) and logo:
                # Stop at the first hit so later scripts cannot overwrite it.
                return logo
    return None


def _find_logo_url(team_soup, slug):
    """Try the known logo locations of a team page in order; return a URL or None."""
    # Method 1: Look for logo in meta tags (most reliable)
    og_image = team_soup.find('meta', property='og:image')
    if og_image and og_image.get('content'):
        return og_image.get('content')
    # Method 2: first <img> inside a div whose class mentions 'team-header' or 'logo'
    team_header = team_soup.find('div', class_=lambda c: c and ('team-header' in c or 'logo' in c))
    if team_header:
        img = team_header.find('img')
        if img and img.get('src'):
            return img.get('src')
    # Method 3: JavaScript data
    logo_url = _logo_from_scripts(team_soup)
    if logo_url:
        return logo_url
    # Method 4: Fallback to a URL pattern built from the slug.
    # NOTE(review): the "abbreviation" is just the first two characters of the
    # slug, which looks like a guess -- confirm these URLs actually resolve.
    if len(slug) > 2:
        team_abbr = slug[:2].upper()
        return f"https://static.www.nfl.com/t_headshot_desktop/f_auto/league/api/clubs/logos/{team_abbr}"
    return None


def get_team_logo_urls():
    """
    Get team logo URLs directly from team pages.

    Fetches NFL_TEAMS_URL, collects the links to the individual team pages,
    then visits each page (pausing one second between pages) and looks for
    the logo in the Open Graph meta tag, the header markup, inline script
    JSON and finally a URL pattern derived from the slug.

    Returns:
        A dictionary mapping team names to their logo URLs. It is empty when
        the index page cannot be fetched; teams whose page fails to load or
        yields no logo are left out (the problem is logged).
    """
    from urllib.parse import urljoin

    logger.info(f"Fetching team information from {NFL_TEAMS_URL}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    # Without a timeout a stalled server would block the whole run.
    timeout = 30
    try:
        response = requests.get(NFL_TEAMS_URL, headers=headers, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logger.error(f"Failed to fetch NFL teams page: {e}")
        return {}
    soup = BeautifulSoup(response.text, 'html.parser')
    team_urls = _collect_team_page_urls(soup)
    logger.info(f"Found {len(team_urls)} unique team URLs")
    # Visit each team page to get the official logo
    team_logos = {}
    for slug, url in team_urls.items():
        try:
            logger.info(f"Visiting team page: {url}")
            team_response = requests.get(url, headers=headers, timeout=timeout)
            team_response.raise_for_status()
            team_soup = BeautifulSoup(team_response.text, 'html.parser')
            team_name = _team_name_from_page(team_soup, slug)
            logo_url = _find_logo_url(team_soup, slug)
            if logo_url:
                # Make the URL absolute; urljoin also resolves protocol-relative
                # '//host/path' values, which plain prefixing would corrupt.
                if not logo_url.startswith('http'):
                    logo_url = urljoin("https://www.nfl.com/", logo_url)
                team_logos[team_name] = logo_url
                logger.info(f"Found logo for {team_name}: {logo_url}")
            else:
                logger.warning(f"Could not find logo URL for {team_name}")
        except Exception as e:
            # Best effort: one broken team page must not abort the others.
            logger.error(f"Error processing team page {url}: {e}")
        finally:
            # Be polite with rate limiting, also after a failed request.
            time.sleep(1)
    logger.info(f"Found logos for {len(team_logos)} teams")
    return team_logos
def _add_espn_logos(team_logos, timeout=30):
    """Add logos for teams missing from ``team_logos`` using ESPN's teams API (best effort)."""
    try:
        espn_url = "https://site.api.espn.com/apis/site/v2/sports/football/nfl/teams"
        response = requests.get(espn_url, timeout=timeout)
        response.raise_for_status()
        espn_data = response.json()
        # The team list is read from sports[0].leagues[0].teams; every level
        # falls back to an empty stand-in so a missing key yields no teams.
        sports = espn_data.get('sports') or [{}]
        leagues = sports[0].get('leagues') or [{}]
        for team_data in leagues[0].get('teams') or []:
            team = team_data.get('team', {})
            team_name = team.get('displayName')
            if not team_name or team_name in team_logos:
                continue
            # An empty 'logos' list must only skip this team, not abort the
            # whole pass with an IndexError.
            logos = team.get('logos') or [{}]
            logo_url = logos[0].get('href')
            if logo_url:
                team_logos[team_name] = logo_url
                logger.info(f"Added {team_name} logo from ESPN API: {logo_url}")
    except Exception as e:
        logger.error(f"Error fetching from ESPN API: {e}")


def _add_manual_logos(team_logos):
    """Add logos for teams missing from ``team_logos`` using a hard-coded table."""
    # Manual dictionary of team logos, keyed by the standard team names.
    # NOTE(review): three URLs appear twice (Bengals/Texans, Cowboys/49ers,
    # Lions/Chargers), so at least three entries point at the wrong image --
    # verify every value against the CDN before relying on this table.
    manual_logos = {
        "Arizona Cardinals": "https://static.www.nfl.com/image/private/f_auto/league/u9fltoslqdsyao8cpm0k",
        "Atlanta Falcons": "https://static.www.nfl.com/image/private/f_auto/league/d8m7hzwsyzgg0smz7ifyj",
        "Baltimore Ravens": "https://static.www.nfl.com/image/private/f_auto/league/ucsdijmddsqcj1i9tddd",
        "Buffalo Bills": "https://static.www.nfl.com/image/private/f_auto/league/giphcy6ie9mxbnldntsf",
        "Carolina Panthers": "https://static.www.nfl.com/image/private/f_auto/league/ervfzgrqdpnc7lh5gqwq",
        "Chicago Bears": "https://static.www.nfl.com/image/private/f_auto/league/ra0poq2ivwyahbaq86d2",
        "Cincinnati Bengals": "https://static.www.nfl.com/image/private/f_auto/league/bpx88i8nw4nnabuq0oob",
        "Cleveland Browns": "https://static.www.nfl.com/image/private/f_auto/league/omlzo6n7dpxzbpwrqaak",
        "Dallas Cowboys": "https://static.www.nfl.com/image/private/f_auto/league/dxibuyxbk0b9ua5ih9hn",
        "Denver Broncos": "https://static.www.nfl.com/image/private/f_auto/league/t0p7m5cjdjy18rnzzqbx",
        "Detroit Lions": "https://static.www.nfl.com/image/private/f_auto/league/dhfidtn8jrumakbawoxz",
        "Green Bay Packers": "https://static.www.nfl.com/image/private/f_auto/league/q1l7xmkuuyrpdmnutkzf",
        "Houston Texans": "https://static.www.nfl.com/image/private/f_auto/league/bpx88i8nw4nnabuq0oob",
        "Indianapolis Colts": "https://static.www.nfl.com/image/private/f_auto/league/ketwqeuschqzjsllbid5",
        "Jacksonville Jaguars": "https://static.www.nfl.com/image/private/f_auto/league/bwl1nuab0n2bhi8nxiar",
        "Kansas City Chiefs": "https://static.www.nfl.com/image/private/f_auto/league/ujshjqvmnxce8m4obmvs",
        "Las Vegas Raiders": "https://static.www.nfl.com/image/private/f_auto/league/gzcojbzcyjgubgyb6xf2",
        "Los Angeles Chargers": "https://static.www.nfl.com/image/private/f_auto/league/dhfidtn8jrumakbawoxz",
        "Los Angeles Rams": "https://static.www.nfl.com/image/private/f_auto/league/rjxoqpjirhjvvitffvwh",
        "Miami Dolphins": "https://static.www.nfl.com/image/private/f_auto/league/lits6p8ycthy9to70bnt",
        "Minnesota Vikings": "https://static.www.nfl.com/image/private/f_auto/league/teguylrnqqmfcwxvcmmz",
        "New England Patriots": "https://static.www.nfl.com/image/private/f_auto/league/moyfxx3dq5pio4aiftnc",
        "New Orleans Saints": "https://static.www.nfl.com/image/private/f_auto/league/grhjkahghuebpwzo6kxn",
        "New York Giants": "https://static.www.nfl.com/image/private/f_auto/league/t6mhdmgizi6qhndh8b9p",
        "New York Jets": "https://static.www.nfl.com/image/private/f_auto/league/ekijosiae96gektbo1lj",
        "Philadelphia Eagles": "https://static.www.nfl.com/image/private/f_auto/league/puhrqgj71gobgmwb5g3p",
        "Pittsburgh Steelers": "https://static.www.nfl.com/image/private/f_auto/league/xujik9a3j8hl6jjumu25",
        "San Francisco 49ers": "https://static.www.nfl.com/image/private/f_auto/league/dxibuyxbk0b9ua5ih9hn",
        "Seattle Seahawks": "https://static.www.nfl.com/image/private/f_auto/league/gcytzwpjdzbpwnwxincg",
        "Tampa Bay Buccaneers": "https://static.www.nfl.com/image/private/f_auto/league/v8uqiualryypwqgvwcih",
        "Tennessee Titans": "https://static.www.nfl.com/image/private/f_auto/league/pln44vuzugjgipyidsre",
        "Washington Commanders": "https://static.www.nfl.com/image/private/f_auto/league/xymxwrxtyj9fhaegfwof",
    }
    # Fill in any missing teams with manual data; teams already present keep
    # the URL found by an earlier source.
    for team_name, logo_url in manual_logos.items():
        if team_name not in team_logos:
            team_logos[team_name] = logo_url
            logger.info(f"Added {team_name} logo from manual dictionary")


def _logo_filename(team_name):
    """Build a lowercase .png file name for ``team_name`` without path separators."""
    safe_name = team_name.replace(' ', '_').lower()
    # Names scraped from page titles are untrusted text: neutralise anything
    # that is not a word character, dot or dash so the file always lands
    # inside the output directory.
    safe_name = re.sub(r'[^\w.-]', '_', safe_name)
    file_extension = '.png'  # Default to PNG
    return f"{safe_name}{file_extension}"


def download_team_logos():
    """Download NFL team logos and save to CSV.

    Collects logo URLs by scraping the team pages, tops the set up from the
    ESPN API and then from a hard-coded table while fewer than
    EXPECTED_TEAM_COUNT teams are known, downloads every logo into OUTPUT_DIR
    and writes one CSV row per successful download to CSV_OUTPUT.

    Returns:
        A list of dicts with the keys 'team_name', 'logo_url' and
        'local_path', one per logo that was downloaded successfully.
    """
    logger.info("Starting NFL team logo download")
    # Ensure output directory exists
    ensure_output_dir(OUTPUT_DIR)
    # Get team logo URLs from team pages
    team_logos = get_team_logo_urls()
    # Use a backup approach for any missing teams
    if len(team_logos) < EXPECTED_TEAM_COUNT:
        logger.warning(f"Only found {len(team_logos)} team logos from web scraping. Using ESPN API as backup.")
        _add_espn_logos(team_logos)
    # If we still don't have enough teams, use a manually defined dictionary
    if len(team_logos) < EXPECTED_TEAM_COUNT:
        logger.warning(f"Still only have {len(team_logos)} teams. Adding manual definitions for missing teams.")
        _add_manual_logos(team_logos)
    # Process and download team logos
    results = []
    for team_name, logo_url in team_logos.items():
        local_path = os.path.join(OUTPUT_DIR, _logo_filename(team_name))
        # Download the logo
        logger.info(f"Downloading logo for {team_name} from {logo_url}")
        if download_image(logo_url, local_path):
            results.append({
                'team_name': team_name,
                'logo_url': logo_url,
                'local_path': local_path,
            })
            logger.info(f"Successfully downloaded logo for {team_name}")
        else:
            logger.error(f"Failed to download logo for {team_name}")
        # Add a small delay between downloads
        time.sleep(0.5)
    # Save to CSV (written even when nothing was downloaded, header only)
    with open(CSV_OUTPUT, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['team_name', 'logo_url', 'local_path']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    logger.info(f"Successfully saved {len(results)} team logos out of {len(team_logos)} teams.")
    logger.info(f"CSV data saved to '{CSV_OUTPUT}'")
    if len(results) < EXPECTED_TEAM_COUNT:
        logger.warning(f"Only downloaded {len(results)} team logos, expected {EXPECTED_TEAM_COUNT}.")
    else:
        logger.info(f"SUCCESS! Downloaded all {EXPECTED_TEAM_COUNT} NFL team logos!")
    return results
# Script entry point: run the whole scrape/download/CSV pipeline only when the
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    download_team_logos()