Spaces:
Sleeping
Sleeping
darisdzakwanhoesien2
committed on
Commit
·
9437058
1
Parent(s):
19902b0
Stage 5 Testing
Browse files
app.py
CHANGED
|
@@ -1,46 +1,34 @@
|
|
| 1 |
import re
|
| 2 |
from src.ontology_adapter import ESGOntologyAdapter
|
| 3 |
import gradio as gr
|
| 4 |
-
from collections import Counter
|
| 5 |
|
| 6 |
adapter = ESGOntologyAdapter("ontology/esg_ontology.owl")
|
| 7 |
|
| 8 |
def detect_sections(text):
    """Split *text* into titled sections.

    A line counts as a section title when it optionally starts with a
    numeric prefix ("1. ") followed by a capitalized run of letters,
    spaces and "&" (e.g. "Introduction", "Results and Discussion").
    Any text before the first title is returned under the title
    "Introduction".

    Args:
        text: Raw document text.

    Returns:
        List of {"title": str, "content": str} dicts in document order.
    """
    # NOTE(review): \s inside the character class also matches newlines, so
    # consecutive title-like lines can merge into a single capture — confirm
    # this is acceptable for the expected inputs.
    pattern = r'^\s*(?:\d+\.\s*)?([A-Z][a-zA-Z\s&]+)\s*$'

    sections = []

    # Fix: the original also ran re.findall(pattern, ...) into a
    # `potential_titles` variable that was never used — dead work, removed.

    # re.split with one capture group yields:
    # [preamble, title1, content1, title2, content2, ...]
    split_text = re.split(pattern, text, flags=re.MULTILINE)

    preamble = split_text[0]
    if preamble.strip():
        sections.append({"title": "Introduction", "content": preamble.strip()})

    # Pair up (title, content) from the alternating tail of the split.
    it = iter(split_text[1:])
    for title, content in zip(it, it):
        sections.append({"title": title.strip(), "content": content.strip()})

    return sections
|
| 33 |
|
| 34 |
-
def
|
|
|
|
| 35 |
detected_sections = detect_sections(text)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
pillar_sentiments = {
|
| 39 |
-
"Environmental": [],
|
| 40 |
-
"Social": [],
|
| 41 |
-
"Governance": [],
|
| 42 |
-
"Unknown": []
|
| 43 |
-
}
|
| 44 |
|
| 45 |
for section in detected_sections:
|
| 46 |
title = section['title']
|
|
@@ -49,65 +37,78 @@ def analyze_text(text):
|
|
| 49 |
if not content.strip():
|
| 50 |
continue
|
| 51 |
|
| 52 |
-
# Determine weight based on section title
|
| 53 |
weight = 1.0
|
| 54 |
if any(keyword in title.lower() for keyword in ["results", "performance"]):
|
| 55 |
-
weight = 1.5
|
| 56 |
|
| 57 |
mapping = adapter.map_term(content)
|
| 58 |
-
section_snippet = content.strip().replace("\n", " ")[:120]
|
| 59 |
|
| 60 |
if mapping['matches']:
|
| 61 |
-
top_match = mapping['matches'][0]
|
| 62 |
-
pillar = top_match['pillar']
|
| 63 |
sentiment = mapping['sentiment']
|
| 64 |
-
|
| 65 |
-
if pillar != "Unknown":
|
| 66 |
-
# Apply weight to sentiment score if possible (conceptual)
|
| 67 |
-
# For now, we just note it. The ontology adapter would need to support weights.
|
| 68 |
-
pillar_sentiments[pillar].append(sentiment)
|
| 69 |
-
|
| 70 |
-
predicted_aspects = []
|
| 71 |
for match in mapping['matches']:
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
results.append(f"🟢 Section: **{title}**\n{section_snippet}\n→ Predicted Aspects: {', '.join(predicted_aspects)}\n→ Sentiment: {sentiment.title()}")
|
| 82 |
-
else:
|
| 83 |
-
results.append(f"🟢 Section: **{title}**\n{section_snippet}\n→ No ESG aspects identified")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
iface = gr.Interface(
|
| 106 |
-
fn=
|
| 107 |
-
inputs=
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
)
|
| 112 |
|
| 113 |
iface.launch()
|
|
|
|
| 1 |
import re
|
| 2 |
from src.ontology_adapter import ESGOntologyAdapter
|
| 3 |
import gradio as gr
|
| 4 |
+
from collections import Counter, defaultdict
|
| 5 |
|
| 6 |
# Module-level ontology adapter, loaded once at import time.
# NOTE(review): the .owl path is relative to the working directory —
# confirm it resolves in the deployment environment.
adapter = ESGOntologyAdapter("ontology/esg_ontology.owl")
|
| 7 |
|
| 8 |
def detect_sections(text):
    """Split *text* into titled sections.

    A line counts as a section title when it optionally starts with a
    numeric prefix ("1. ") followed by a capitalized run of letters,
    spaces and "&" (e.g. "Introduction", "Results and Discussion").
    Any text before the first title is returned under the title
    "Introduction".

    Args:
        text: Raw document text.

    Returns:
        List of {"title": str, "content": str} dicts in document order.
    """
    # NOTE(review): \s inside the character class also matches newlines, so
    # consecutive title-like lines can merge into a single capture — confirm
    # this is acceptable for the expected inputs.
    pattern = r'^\s*(?:\d+\.\s*)?([A-Z][a-zA-Z\s&]+)\s*$'

    sections = []

    # Fix: the original also ran re.findall(pattern, ...) into a
    # `potential_titles` variable that was never used — dead work, removed.

    # re.split with one capture group yields:
    # [preamble, title1, content1, title2, content2, ...]
    split_text = re.split(pattern, text, flags=re.MULTILINE)

    preamble = split_text[0]
    if preamble.strip():
        sections.append({"title": "Introduction", "content": preamble.strip()})

    # Pair up (title, content) from the alternating tail of the split.
    it = iter(split_text[1:])
    for title, content in zip(it, it):
        sections.append({"title": title.strip(), "content": content.strip()})

    return sections
|
| 27 |
|
| 28 |
+
def analyze_single_document(text):
    """Analyze a single document and return aspect-level sentiment.

    Each detected section is mapped through the ontology adapter; the
    section's sentiment is attributed to every aspect matched in it, and
    per-aspect sentiments are aggregated by majority vote.

    Args:
        text: Raw report text.

    Returns:
        Dict mapping aspect name -> most common sentiment label for it.
    """
    detected_sections = detect_sections(text)
    aspect_sentiments = defaultdict(list)

    for section in detected_sections:
        title = section['title']
        content = section['content']
        if not content.strip():
            # Title-only sections contribute nothing.
            continue

        # Result/performance sections are meant to count more.
        # NOTE(review): `weight` is currently unused — adapter.map_term
        # would need weighting support for this to take effect; kept as a
        # documented placeholder rather than silently dropped.
        weight = 1.0
        if any(keyword in title.lower() for keyword in ["results", "performance"]):
            weight = 1.5

        mapping = adapter.map_term(content)

        if mapping['matches']:
            # One sentiment per section, attributed to every matched aspect.
            sentiment = mapping['sentiment']
            for match in mapping['matches']:
                aspect_sentiments[match['mapped_to']].append(sentiment)

    # Majority vote per aspect (ties resolved by Counter/insertion order).
    aggregated_sentiments = {}
    for aspect, sentiments in aspect_sentiments.items():
        if sentiments:
            aggregated_sentiments[aspect] = Counter(sentiments).most_common(1)[0][0]

    return aggregated_sentiments
|
| 59 |
+
|
| 60 |
+
def compare_documents(text1, text2):
    """Compares two documents for sentiment consistency."""
    # Per-document aspect -> sentiment maps.
    sentiments1 = analyze_single_document(text1)
    sentiments2 = analyze_single_document(text2)

    def _fmt(aspect):
        # Human-readable aspect name: "supply_chain" -> "Supply Chain".
        return aspect.replace('_', ' ').title()

    # Accumulate the report as a list of fragments and join once at the end.
    parts = ["--- Document 1 Analysis ---\n"]
    parts.extend(f"- **{_fmt(a)}**: {s.title()}\n" for a, s in sentiments1.items())
    parts.append("\n--- Document 2 Analysis ---\n")
    parts.extend(f"- **{_fmt(a)}**: {s.title()}\n" for a, s in sentiments2.items())

    # Cross-document comparison over the union of aspects.
    parts.append("\n--- Cross-Document Consistency Analysis ---\n")
    found_drift = False
    for aspect in sorted(set(sentiments1.keys()) | set(sentiments2.keys())):
        s1 = sentiments1.get(aspect)
        s2 = sentiments2.get(aspect)
        aspect_name = _fmt(aspect)

        if s1 and s2 and s1 != s2:
            parts.append(f"🟡 **Sentiment Drift Detected for '{aspect_name}'**\n")
            parts.append(f" - Document 1: {s1.title()}\n")
            parts.append(f" - Document 2: {s2.title()}\n\n")
            found_drift = True
        elif s1 and not s2:
            parts.append(f"⚪️ **'{aspect_name}'** only found in Document 1 (Sentiment: {s1.title()})\n\n")
        elif s2 and not s1:
            parts.append(f"⚪️ **'{aspect_name}'** only found in Document 2 (Sentiment: {s2.title()})\n\n")

    if not found_drift:
        parts.append("✅ No sentiment contradictions detected between the two documents for common aspects.\n")

    return "".join(parts)
|
| 101 |
|
| 102 |
|
| 103 |
# Gradio UI: two text inputs (one per report) feeding compare_documents;
# the combined per-document analysis plus consistency report is shown in
# a single output textbox.
iface = gr.Interface(
    fn=compare_documents,
    inputs=[
        gr.Textbox(label="Input ESG Report Text 1 (e.g., 2022 Report)", lines=15, placeholder="Paste the first report here..."),
        gr.Textbox(label="Input ESG Report Text 2 (e.g., 2023 Report)", lines=15, placeholder="Paste the second report here...")
    ],
    outputs=gr.Textbox(label="Cross-Document Consistency Analysis", lines=40),
    title="ESG Cross-Document Sentiment Consistency Analysis",
    description="Compares two ESG reports to detect sentiment drift or contradictions for the same aspects over time."
)

# Start the app (blocking call; binds a local web server).
iface.launch()
|