import json

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://timesofindia.indiatimes.com"


def get_top_articles(url):
    """Return up to 10 article links scraped from a Times of India topic page."""
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        articles = []
        # Story pages on the site contain "/articleshow/" in their URLs.
        for article in soup.select('a[href*="/articleshow/"]')[:10]:
            title = article.get_text(strip=True)
            link = article["href"]
            # Relative links need the site root prepended.
            if not link.startswith("http"):
                link = BASE_URL + link
            articles.append({"title": title, "link": link})
        return articles
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}


def extract_article_content(url):
    """Fetch one article page and return its title, body text, and link."""
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.find("h1")
        article_title = title.get_text(strip=True) if title else "No title found"
        # The article body sits in the div marked data-articlebody="1".
        article_body = soup.find("div", {"data-articlebody": "1"})
        if article_body:
            # Keep only child divs with substantial text to skip ads and captions.
            paragraphs = [
                p.get_text(strip=True)
                for p in article_body.find_all("div")
                if len(p.get_text(strip=True)) > 20
            ]
            content = "\n".join(paragraphs)
        else:
            content = "No content found"
        return {"title": article_title, "content": content, "link": url}
    except requests.exceptions.RequestException as e:
        return {"title": "Error", "content": f"Error fetching content: {e}", "link": url}


if __name__ == "__main__":
    url = "https://timesofindia.indiatimes.com/topic/Google"
    print(f"Extracting top 10 articles from: {url}\n")
    articles = get_top_articles(url)
    # get_top_articles returns a dict only on failure, otherwise a list.
    if isinstance(articles, dict) and "error" in articles:
        print("Error:", articles["error"])
    else:
        all_articles = []
        for idx, article in enumerate(articles, start=1):
            print(f"Extracting content for article {idx}: {article['title']}\n"
                  f" Link: {article['link']}\n")
            article_data = extract_article_content(article["link"])
            print(f"Heading: {article_data['title']}\n Link: {article_data['link']}\n")
            print(f"Content:\n{article_data['content']}\n")
            all_articles.append(article_data)
        with open("articles.json", "w", encoding="utf-8") as f:
            json.dump(all_articles, f, ensure_ascii=False, indent=4)
        print("All articles saved to articles.json")
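

# ---------------------------------------------------------------------------
# Optional hardening (a sketch, not part of the original script): the bare
# requests.get() calls above fail an entire article on any transient 429/5xx
# response. One common pattern is a shared requests.Session mounted with
# urllib3's Retry policy. The retry count and backoff_factor here are
# illustrative assumptions, not values taken from the original; tune them to
# the site's rate limits. To use it, create the session once and call
# session.get(url, timeout=10) inside get_top_articles and
# extract_article_content instead of requests.get.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_retrying_session():
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})
    retries = Retry(
        total=3,  # retry each request up to 3 times
        backoff_factor=1,  # exponential backoff between attempts
        status_forcelist=[429, 500, 502, 503, 504],  # retry on these statuses
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    return session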