|
|
"""
Example 2: Data Ingestion - PDF and Web Scraping

This example demonstrates:
- PDF document reading and processing
- Web article extraction
- News aggregation
- Intelligence extraction from documents
"""
|
|
|
|
|
import sys |
|
|
sys.path.append('..') |
|
|
|
|
|
from geobot.data_ingestion.pdf_reader import PDFReader, PDFProcessor |
|
|
from geobot.data_ingestion.web_scraper import WebScraper, ArticleExtractor, NewsAggregator |
|
|
|
|
|
|
|
|
def demo_pdf_processing():
    """Demonstrate PDF processing capabilities.

    Prints a banner, the list of supported PDF-processing features, and an
    illustrative usage snippet. Purely informational: no PDF is actually
    read, so the function has no side effects beyond stdout.

    Returns:
        None
    """
    print("\n" + "=" * 80)
    print("PDF Processing Demo")
    print("=" * 80)

    # NOTE(review): the original instantiated PDFProcessor() here but never
    # used it; the object was removed so this docs-only demo cannot fail
    # when optional PDF dependencies are missing.
    print("\nPDF processing capabilities:")
    print("- Text extraction from PDFs")
    print("- Table extraction")
    print("- Metadata extraction")
    print("- Entity recognition (countries, organizations)")
    print("- Keyword extraction")
    print("- Risk assessment")
    print("\nTo use: processor.process_document('path/to/document.pdf')")

    # Shown verbatim to the user; not executed.
    example_code = """
# Process a single PDF
result = processor.process_document('intelligence_report.pdf')

print(f"Title: {result['metadata'].get('title', 'Unknown')}")
print(f"Pages: {result['num_pages']}")
print(f"Keywords: {result['keywords']}")
print(f"Risk Level: {result['intelligence']['risk_level']}")

# Process multiple PDFs
results = processor.batch_process('reports_directory/', '*.pdf')
"""

    print("\nExample usage:")
    print(example_code)
|
|
|
|
|
|
|
|
def demo_web_scraping():
    """Demonstrate web scraping capabilities.

    Prints a banner, the list of supported article-extraction features, and
    an illustrative usage snippet. No network request is made; the function
    has no side effects beyond stdout.

    Returns:
        None
    """
    print("\n" + "=" * 80)
    print("Web Scraping Demo")
    print("=" * 80)

    # NOTE(review): the original instantiated ArticleExtractor() here but
    # never used it; removed so the docs-only demo cannot fail when
    # optional scraping dependencies are missing.
    print("\nWeb scraping capabilities:")
    print("- Extract articles from URLs")
    print("- Clean HTML content")
    print("- Extract metadata (author, date, etc.)")
    print("- Multiple extraction methods (newspaper3k, trafilatura, BeautifulSoup)")

    # Placeholder URL used only in the printed message below.
    example_url = "https://www.example.com/geopolitical-analysis"

    print(f"\nExample: Extracting article from {example_url}")
    print("(This is a demonstration - no actual web request is made)")

    # Shown verbatim to the user; not executed.
    example_code = """
# Extract article
article = extractor.extract_article(url)

print(f"Title: {article['title']}")
print(f"Author: {article['authors']}")
print(f"Published: {article['publish_date']}")
print(f"Content length: {len(article['text'])} characters")

# Extract multiple articles
urls = ['url1', 'url2', 'url3']
articles = extractor.batch_extract(urls)
"""

    print("\nExample usage:")
    print(example_code)
|
|
|
|
|
|
|
|
def demo_news_aggregation():
    """Demonstrate news aggregation capabilities.

    Prints a banner, the list of supported aggregation features, and an
    illustrative setup snippet (sources, keyword fetch, trending topics,
    continuous monitoring). Nothing is fetched; the function has no side
    effects beyond stdout.

    Returns:
        None
    """
    print("\n" + "=" * 80)
    print("News Aggregation Demo")
    print("=" * 80)

    # NOTE(review): the original instantiated NewsAggregator() here but
    # never used it; removed so the docs-only demo cannot fail when
    # optional feed dependencies are missing.
    print("\nNews aggregation capabilities:")
    print("- Aggregate from multiple sources")
    print("- RSS feed support")
    print("- Keyword filtering")
    print("- Trending topic detection")
    print("- Real-time monitoring")

    print("\nExample: Setting up news aggregation")

    # Shown verbatim to the user; not executed.
    example_code = """
# Add news sources
aggregator.add_source(
    name='Reuters',
    url='https://www.reuters.com/news/world',
    source_type='rss'
)

aggregator.add_source(
    name='Al Jazeera',
    url='https://www.aljazeera.com/xml/rss/all.xml',
    source_type='rss'
)

# Fetch news with keywords
keywords = ['sanctions', 'conflict', 'diplomacy', 'military']
articles = aggregator.fetch_news(keywords)

print(f"Found {len(articles)} relevant articles")

# Get trending topics
topics = aggregator.get_trending_topics(articles, n_topics=10)
print("Trending topics:", topics)

# Monitor sources continuously
def alert_callback(new_articles):
    print(f"ALERT: {len(new_articles)} new relevant articles found")
    for article in new_articles:
        print(f"  - {article['title']}")

# Monitor every hour
aggregator.monitor_sources(keywords, callback=alert_callback, interval=3600)
"""

    print(example_code)
|
|
|
|
|
|
|
|
def demo_intelligence_extraction():
    """Demonstrate intelligence extraction from documents.

    Prints a banner, the supported intelligence-extraction features, and an
    illustrative usage snippet. Informational only; writes to stdout and
    returns None.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Intelligence Extraction Demo")
    print(banner)

    # Feature list rendered as bullet points.
    capabilities = [
        "Country and organization detection",
        "Conflict indicator detection",
        "Risk level assessment",
        "Document classification",
        "Key phrase extraction",
    ]
    print("\nIntelligence extraction capabilities:")
    for capability in capabilities:
        print(f"- {capability}")

    # Shown verbatim to the user; not executed.
    example_code = """
processor = PDFProcessor()

# Extract intelligence from PDF
intel = processor.extract_intelligence('report.pdf')

print("Intelligence Summary:")
print(f"Risk Level: {intel['intelligence']['risk_level']}")
print(f"Countries mentioned: {intel['intelligence']['mentioned_countries']}")
print(f"Conflict indicators: {intel['intelligence']['conflict_indicators']}")
print(f"Key topics: {intel['intelligence']['key_topics']}")
print(f"Document type: {intel['intelligence']['document_type']}")
"""

    print("\nExample usage:")
    print(example_code)
|
|
|
|
|
|
|
|
def main():
    """Run every data-ingestion demo in order, framed by banners.

    Entry point for the example script; writes to stdout and returns None.
    """
    banner = "=" * 80
    print(banner)
    print("GeoBotv1 - Data Ingestion Examples")
    print(banner)
    print("\nThis module demonstrates the data ingestion capabilities of GeoBotv1:")

    # Table of (description, demo callable) pairs drives both the numbered
    # overview and the execution order.
    sections = [
        ("PDF document processing", demo_pdf_processing),
        ("Web scraping and article extraction", demo_web_scraping),
        ("News aggregation from multiple sources", demo_news_aggregation),
        ("Intelligence extraction from documents", demo_intelligence_extraction),
    ]
    for number, (description, _) in enumerate(sections, start=1):
        print(f"{number}. {description}")

    for _, demo in sections:
        demo()

    print("\n" + banner)
    print("Data Ingestion Demo Complete")
    print(banner)
    print("\nNote: Install required packages for full functionality:")
    print("  pip install pypdf pdfplumber beautifulsoup4 newspaper3k trafilatura")
|
|
|
|
|
|
|
|
# Run all demos only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|