"""
Example 2: Data Ingestion - PDF and Web Scraping
This example demonstrates:
- PDF document reading and processing
- Web article extraction
- News aggregation
- Intelligence extraction from documents
"""
import sys
sys.path.append('..')
from geobot.data_ingestion.pdf_reader import PDFReader, PDFProcessor
from geobot.data_ingestion.web_scraper import WebScraper, ArticleExtractor, NewsAggregator

def demo_pdf_processing():
    """Demonstrate PDF processing capabilities."""
    print("\n" + "=" * 80)
    print("PDF Processing Demo")
    print("=" * 80)

    # Create PDF processor
    processor = PDFProcessor()

    print("\nPDF processing capabilities:")
    print("- Text extraction from PDFs")
    print("- Table extraction")
    print("- Metadata extraction")
    print("- Entity recognition (countries, organizations)")
    print("- Keyword extraction")
    print("- Risk assessment")
    print("\nTo use: processor.process_document('path/to/document.pdf')")

    # Example code structure
    example_code = """
    # Process a single PDF
    result = processor.process_document('intelligence_report.pdf')
    print(f"Title: {result['metadata'].get('title', 'Unknown')}")
    print(f"Pages: {result['num_pages']}")
    print(f"Keywords: {result['keywords']}")
    print(f"Risk Level: {result['intelligence']['risk_level']}")

    # Process multiple PDFs
    results = processor.batch_process('reports_directory/', '*.pdf')
    """
    print("\nExample usage:")
    print(example_code)
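
    # Hedged sketch of the low-level text extraction that a processor like
    # this typically builds on. It uses pypdf directly and is illustrative
    # only; it is not the framework's own implementation.
    pypdf_sketch = """
    from pypdf import PdfReader

    reader = PdfReader('intelligence_report.pdf')
    print(f"Pages: {len(reader.pages)}")
    first_page_text = reader.pages[0].extract_text()
    print(first_page_text[:500])
    """
    print("\nLow-level extraction sketch (pypdf, illustrative):")
    print(pypdf_sketch)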

def demo_web_scraping():
    """Demonstrate web scraping capabilities."""
    print("\n" + "=" * 80)
    print("Web Scraping Demo")
    print("=" * 80)

    # Create article extractor
    extractor = ArticleExtractor()

    print("\nWeb scraping capabilities:")
    print("- Extract articles from URLs")
    print("- Clean HTML content")
    print("- Extract metadata (author, date, etc.)")
    print("- Multiple extraction methods (newspaper3k, trafilatura, BeautifulSoup; see the fallback sketch below)")

    # Example with a well-known news site (without actually fetching)
    example_url = "https://www.example.com/geopolitical-analysis"
    print(f"\nExample: Extracting article from {example_url}")
    print("(This is a demonstration - no actual web request is made)")

    example_code = """
    # Extract article
    article = extractor.extract_article(url)
    print(f"Title: {article['title']}")
    print(f"Authors: {article['authors']}")
    print(f"Published: {article['publish_date']}")
    print(f"Content length: {len(article['text'])} characters")

    # Extract multiple articles
    urls = ['url1', 'url2', 'url3']
    articles = extractor.batch_extract(urls)
    """
    print("\nExample usage:")
    print(example_code)
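
    # Hedged sketch of a multi-method fallback chain, illustrating the idea
    # behind "multiple extraction methods": try trafilatura's article
    # extraction first, then fall back to a generic BeautifulSoup text dump.
    # The framework's ArticleExtractor may work differently.
    fallback_sketch = """
    import requests
    import trafilatura
    from bs4 import BeautifulSoup

    def extract_text(url):
        # Prefer trafilatura's focused article extraction
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(downloaded)
            if text:
                return text
        # Fall back to stripping all visible text with BeautifulSoup
        html = requests.get(url, timeout=10).text
        return BeautifulSoup(html, 'html.parser').get_text(' ', strip=True)
    """
    print("\nFallback-chain sketch (illustrative, not the framework's code):")
    print(fallback_sketch)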

def demo_news_aggregation():
    """Demonstrate news aggregation capabilities."""
    print("\n" + "=" * 80)
    print("News Aggregation Demo")
    print("=" * 80)

    aggregator = NewsAggregator()

    print("\nNews aggregation capabilities:")
    print("- Aggregate from multiple sources")
    print("- RSS feed support")
    print("- Keyword filtering")
    print("- Trending topic detection (see the counting sketch below)")
    print("- Real-time monitoring")

    # Example configuration
    print("\nExample: Setting up news aggregation")
    example_code = """
    # Add news sources
    aggregator.add_source(
        name='Reuters',
        url='https://www.reuters.com/news/world',
        source_type='rss'
    )
    aggregator.add_source(
        name='Al Jazeera',
        url='https://www.aljazeera.com/xml/rss/all.xml',
        source_type='rss'
    )

    # Fetch news with keywords
    keywords = ['sanctions', 'conflict', 'diplomacy', 'military']
    articles = aggregator.fetch_news(keywords)
    print(f"Found {len(articles)} relevant articles")

    # Get trending topics
    topics = aggregator.get_trending_topics(articles, n_topics=10)
    print("Trending topics:", topics)

    # Monitor sources continuously, checking every hour
    def alert_callback(new_articles):
        print(f"ALERT: {len(new_articles)} new relevant articles found")
        for article in new_articles:
            print(f"  - {article['title']}")

    aggregator.monitor_sources(keywords, callback=alert_callback, interval=3600)
    """
    print(example_code)
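
    # Hedged sketch of what trending-topic detection can reduce to: counting
    # word frequencies across article titles with a small stoplist. The
    # framework's get_trending_topics may use a more sophisticated method.
    trending_sketch = """
    from collections import Counter

    STOPWORDS = {'the', 'a', 'an', 'of', 'in', 'on', 'to', 'and', 'for', 'with'}

    def trending_topics(articles, n_topics=10):
        counts = Counter(
            word
            for article in articles
            for word in article['title'].lower().split()
            if word not in STOPWORDS and len(word) > 3
        )
        return [word for word, _ in counts.most_common(n_topics)]
    """
    print("\nTrending-topic counting sketch (illustrative):")
    print(trending_sketch)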

def demo_intelligence_extraction():
    """Demonstrate intelligence extraction from documents."""
    print("\n" + "=" * 80)
    print("Intelligence Extraction Demo")
    print("=" * 80)

    print("\nIntelligence extraction capabilities:")
    print("- Country and organization detection")
    print("- Conflict indicator detection")
    print("- Risk level assessment (see the heuristic sketch below)")
    print("- Document classification")
    print("- Key phrase extraction")

    example_code = """
    processor = PDFProcessor()

    # Extract intelligence from PDF
    intel = processor.extract_intelligence('report.pdf')
    print("Intelligence Summary:")
    print(f"Risk Level: {intel['intelligence']['risk_level']}")
    print(f"Countries mentioned: {intel['intelligence']['mentioned_countries']}")
    print(f"Conflict indicators: {intel['intelligence']['conflict_indicators']}")
    print(f"Key topics: {intel['intelligence']['key_topics']}")
    print(f"Document type: {intel['intelligence']['document_type']}")
    """
    print("\nExample usage:")
    print(example_code)
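
    # Hedged sketch of one plausible way a risk_level field could be derived:
    # a simple keyword heuristic over the document text. The framework's
    # actual scoring logic is not shown here.
    risk_sketch = """
    CONFLICT_TERMS = {'war', 'invasion', 'sanctions', 'coup', 'missile', 'strike'}

    def assess_risk(text):
        hits = sum(term in text.lower() for term in CONFLICT_TERMS)
        if hits >= 3:
            return 'high'
        if hits >= 1:
            return 'medium'
        return 'low'
    """
    print("\nRisk-heuristic sketch (illustrative):")
    print(risk_sketch)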

def main():
    print("=" * 80)
    print("GeoBotv1 - Data Ingestion Examples")
    print("=" * 80)
    print("\nThis module demonstrates the data ingestion capabilities of GeoBotv1:")
    print("1. PDF document processing")
    print("2. Web scraping and article extraction")
    print("3. News aggregation from multiple sources")
    print("4. Intelligence extraction from documents")

    demo_pdf_processing()
    demo_web_scraping()
    demo_news_aggregation()
    demo_intelligence_extraction()

    print("\n" + "=" * 80)
    print("Data Ingestion Demo Complete")
    print("=" * 80)
    print("\nNote: Install required packages for full functionality:")
    print("  pip install pypdf pdfplumber beautifulsoup4 newspaper3k trafilatura")

if __name__ == "__main__":
    main()