File size: 6,109 Bytes
484e3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
Example 2: Data Ingestion - PDF and Web Scraping

This example demonstrates:
- PDF document reading and processing
- Web article extraction
- News aggregation
- Intelligence extraction from documents
"""

import sys
sys.path.append('..')

from geobot.data_ingestion.pdf_reader import PDFReader, PDFProcessor
from geobot.data_ingestion.web_scraper import WebScraper, ArticleExtractor, NewsAggregator


def demo_pdf_processing():
    """Print an overview of the PDF-processing features and example usage.

    Purely illustrative: instantiates a ``PDFProcessor`` but performs no
    actual document processing. Returns ``None``.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print("PDF Processing Demo")
    print(banner)

    # Created for demonstration only — no document is processed here.
    processor = PDFProcessor()

    print("\nPDF processing capabilities:")
    for capability in (
        "- Text extraction from PDFs",
        "- Table extraction",
        "- Metadata extraction",
        "- Entity recognition (countries, organizations)",
        "- Keyword extraction",
        "- Risk assessment",
    ):
        print(capability)
    print("\nTo use: processor.process_document('path/to/document.pdf')")

    # Illustrative snippet shown to the user; not executed.
    example_code = """
    # Process a single PDF
    result = processor.process_document('intelligence_report.pdf')

    print(f"Title: {result['metadata'].get('title', 'Unknown')}")
    print(f"Pages: {result['num_pages']}")
    print(f"Keywords: {result['keywords']}")
    print(f"Risk Level: {result['intelligence']['risk_level']}")

    # Process multiple PDFs
    results = processor.batch_process('reports_directory/', '*.pdf')
    """

    print("\nExample usage:")
    print(example_code)


def demo_web_scraping():
    """Print an overview of the web-scraping features and example usage.

    Purely illustrative: instantiates an ``ArticleExtractor`` but performs
    no network request. Returns ``None``.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print("Web Scraping Demo")
    print(banner)

    # Created for demonstration only — nothing is fetched here.
    extractor = ArticleExtractor()

    print("\nWeb scraping capabilities:")
    for capability in (
        "- Extract articles from URLs",
        "- Clean HTML content",
        "- Extract metadata (author, date, etc.)",
        "- Multiple extraction methods (newspaper3k, trafilatura, BeautifulSoup)",
    ):
        print(capability)

    # Placeholder URL used only in the printed example text.
    target_url = "https://www.example.com/geopolitical-analysis"

    print(f"\nExample: Extracting article from {target_url}")
    print("(This is a demonstration - no actual web request is made)")

    # Illustrative snippet shown to the user; not executed.
    example_code = """
    # Extract article
    article = extractor.extract_article(url)

    print(f"Title: {article['title']}")
    print(f"Author: {article['authors']}")
    print(f"Published: {article['publish_date']}")
    print(f"Content length: {len(article['text'])} characters")

    # Extract multiple articles
    urls = ['url1', 'url2', 'url3']
    articles = extractor.batch_extract(urls)
    """

    print("\nExample usage:")
    print(example_code)


def demo_news_aggregation():
    """Print an overview of the news-aggregation features and example usage.

    Purely illustrative: instantiates a ``NewsAggregator`` but fetches
    nothing. Returns ``None``.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print("News Aggregation Demo")
    print(banner)

    # Created for demonstration only — no sources are contacted here.
    aggregator = NewsAggregator()

    print("\nNews aggregation capabilities:")
    for capability in (
        "- Aggregate from multiple sources",
        "- RSS feed support",
        "- Keyword filtering",
        "- Trending topic detection",
        "- Real-time monitoring",
    ):
        print(capability)

    print("\nExample: Setting up news aggregation")

    # Illustrative snippet shown to the user; not executed.
    example_code = """
    # Add news sources
    aggregator.add_source(
        name='Reuters',
        url='https://www.reuters.com/news/world',
        source_type='rss'
    )

    aggregator.add_source(
        name='Al Jazeera',
        url='https://www.aljazeera.com/xml/rss/all.xml',
        source_type='rss'
    )

    # Fetch news with keywords
    keywords = ['sanctions', 'conflict', 'diplomacy', 'military']
    articles = aggregator.fetch_news(keywords)

    print(f"Found {len(articles)} relevant articles")

    # Get trending topics
    topics = aggregator.get_trending_topics(articles, n_topics=10)
    print("Trending topics:", topics)

    # Monitor sources continuously
    def alert_callback(new_articles):
        print(f"ALERT: {len(new_articles)} new relevant articles found")
        for article in new_articles:
            print(f"  - {article['title']}")

    # Monitor every hour
    aggregator.monitor_sources(keywords, callback=alert_callback, interval=3600)
    """

    print(example_code)


def demo_intelligence_extraction():
    """Print an overview of intelligence-extraction features and example usage.

    Purely illustrative: only prints text; no processing occurs.
    Returns ``None``.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print("Intelligence Extraction Demo")
    print(banner)

    print("\nIntelligence extraction capabilities:")
    for capability in (
        "- Country and organization detection",
        "- Conflict indicator detection",
        "- Risk level assessment",
        "- Document classification",
        "- Key phrase extraction",
    ):
        print(capability)

    # Illustrative snippet shown to the user; not executed.
    example_code = """
    processor = PDFProcessor()

    # Extract intelligence from PDF
    intel = processor.extract_intelligence('report.pdf')

    print("Intelligence Summary:")
    print(f"Risk Level: {intel['intelligence']['risk_level']}")
    print(f"Countries mentioned: {intel['intelligence']['mentioned_countries']}")
    print(f"Conflict indicators: {intel['intelligence']['conflict_indicators']}")
    print(f"Key topics: {intel['intelligence']['key_topics']}")
    print(f"Document type: {intel['intelligence']['document_type']}")
    """

    print("\nExample usage:")
    print(example_code)


def main():
    """Run all data-ingestion demos in sequence and print framing banners.

    Returns ``None``. Note: full functionality requires the third-party
    packages listed in the closing message.
    """
    banner = "=" * 80
    print(banner)
    print("GeoBotv1 - Data Ingestion Examples")
    print(banner)
    print("\nThis module demonstrates the data ingestion capabilities of GeoBotv1:")
    for item in (
        "1. PDF document processing",
        "2. Web scraping and article extraction",
        "3. News aggregation from multiple sources",
        "4. Intelligence extraction from documents",
    ):
        print(item)

    # Run each demo in the order listed above.
    for demo in (
        demo_pdf_processing,
        demo_web_scraping,
        demo_news_aggregation,
        demo_intelligence_extraction,
    ):
        demo()

    print(f"\n{banner}")
    print("Data Ingestion Demo Complete")
    print(banner)
    print("\nNote: Install required packages for full functionality:")
    print("  pip install pypdf pdfplumber beautifulsoup4 newspaper3k trafilatura")


if __name__ == "__main__":
    main()