darisdzakwanhoesien2 committed
Commit 8cd5464 · 1 Parent(s): ef22374

Stage 10 — Interpretability Dashboard & Bias Analysis

Files changed (2):
  1. app.py +159 -95
  2. requirements.txt +3 -1
app.py CHANGED
@@ -5,6 +5,8 @@ from collections import Counter, defaultdict
 from rdflib import Graph, Literal, Namespace, URIRef
 from rdflib.namespace import RDF, RDFS
 from keybert import KeyBERT
+import pandas as pd
+import plotly.express as px
 
 # --- Model Configuration ---
 # To use a fine-tuned model, change this path to the directory where your model is saved.
@@ -53,10 +55,12 @@ def detect_sections(text):
 
     return sections
 
-def analyze_single_document(text):
-    """Analyzes a single document and returns aspect-level sentiment."""
+def analyze_single_document(text, doc_name="Document"):
+    """Analyzes a single document and returns aspect-level sentiment and other metrics."""
     detected_sections = detect_sections(text)
     aspect_sentiments = defaultdict(list)
+    aspect_confidence = defaultdict(list)
+    optimism_bias_scores = defaultdict(list)
 
     for section in detected_sections:
         title = section['title']
@@ -65,9 +69,13 @@ def analyze_single_document(text):
         if not content.strip():
             continue
 
-        weight = 1.0
-        if any(keyword in title.lower() for keyword in ["results", "performance"]):
-            weight = 1.5
+        # Section-aware weighting (Bias Analysis)
+        # Increase weight for forward-looking/promise sections, decrease for results
+        tone_weight = 1.0
+        if any(keyword in title.lower() for keyword in ["strategy", "commitment", "goal"]):
+            tone_weight = 1.2  # Higher optimism bias likely
+        if any(keyword in title.lower() for keyword in ["results", "performance", "data"]):
+            tone_weight = 0.8  # Lower optimism bias likely
 
         mapping = adapter.map_term(content)
 
@@ -75,148 +83,204 @@ def analyze_single_document(text):
         sentiment = mapping['sentiment']
         for match in mapping['matches']:
             aspect = match['mapped_to']
+            score = match['score']
+
             aspect_sentiments[aspect].append(sentiment)
+            aspect_confidence[aspect].append(score)
+
+            # Calculate a simple optimism score
+            if sentiment == 'positive':
+                optimism_bias_scores[aspect].append(tone_weight * score)
+            elif sentiment == 'negative':
+                optimism_bias_scores[aspect].append(-1 * score)  # Negative sentiment counts against optimism
 
-    # Aggregate sentiments for each aspect (e.g., by taking the most common one)
+    # Aggregate results
     aggregated_sentiments = {}
+    avg_confidence = {}
+    final_optimism_bias = {}
+
     for aspect, sentiments in aspect_sentiments.items():
         if sentiments:
+            # Sentiment: most common
             aggregated_sentiments[aspect] = Counter(sentiments).most_common(1)[0][0]
+            # Confidence: average score for the aspect
+            avg_confidence[aspect] = sum(aspect_confidence[aspect]) / len(aspect_confidence[aspect])
+            # Optimism Bias: average of the weighted scores
+            if aspect in optimism_bias_scores:
+                final_optimism_bias[aspect] = sum(optimism_bias_scores[aspect]) / len(optimism_bias_scores[aspect])
+            else:
+                final_optimism_bias[aspect] = 0
+
+    # Create a DataFrame for visualization
+    df = pd.DataFrame({
+        'Aspect': [a.replace('_', ' ').title() for a in aggregated_sentiments.keys()],
+        'Sentiment': list(aggregated_sentiments.values()),
+        'Confidence': [avg_confidence.get(a, 0) for a in aggregated_sentiments.keys()],
+        'Optimism Bias': [final_optimism_bias.get(a, 0) for a in aggregated_sentiments.keys()],
+        'Document': doc_name
+    })
 
-    return aggregated_sentiments
+    return aggregated_sentiments, df
 
 def discover_new_aspects(text, existing_aspects):
     """Discovers new potential ESG aspects from text using KeyBERT."""
-
-    # Clean up text for better keyword extraction
     text = text.replace('\n', ' ')
-
-    # Extract keywords/keyphrases
     keywords = kw_model.extract_keywords(
-        text,
-        keyphrase_ngram_range=(1, 3),  # Consider phrases up to 3 words
-        stop_words='english',
-        use_mmr=True,  # Use Maximal Marginal Relevance to diversify results
-        diversity=0.7,
-        top_n=10  # Get top 10 candidates
+        text, keyphrase_ngram_range=(1, 3), stop_words='english',
+        use_mmr=True, diversity=0.7, top_n=10
     )
 
-    # Filter out keywords that are too similar to existing aspects
     suggested_aspects = []
     existing_aspect_labels = {aspect.replace('_', ' ') for aspect in existing_aspects}
 
     for keyword, score in keywords:
-        # Simple check to see if the keyword is already in our ontology
-        # A more advanced check would use semantic similarity.
         if keyword.lower() not in existing_aspect_labels and len(keyword) > 5:
-            suggested_aspects.append(f"**{keyword.title()}** (Score: {score:.2f})")
+            suggested_aspects.append(f"- **{keyword.title()}** (Confidence: {score:.2f})")
 
-    return suggested_aspects
+    return "\n".join(suggested_aspects) if suggested_aspects else "No new aspects discovered."
 
 def generate_knowledge_graph(sentiments1, sentiments2):
     """Generates an RDF knowledge graph from sentiment analysis results."""
     g = Graph()
-    g.bind("esg", ESG)
-    g.bind("doc", DOC)
-    g.bind("rdf", RDF)
-    g.bind("rdfs", RDFS)
+    g.bind("esg", ESG); g.bind("doc", DOC); g.bind("rdf", RDF); g.bind("rdfs", RDFS)
 
-    # Add triples for Document 1
+    # Document 1
     doc1_uri = DOC['report_1']
-    g.add((doc1_uri, RDF.type, ESG.Document))
-    g.add((doc1_uri, RDFS.label, Literal("Document 1")))
+    g.add((doc1_uri, RDF.type, ESG.Document)); g.add((doc1_uri, RDFS.label, Literal("Document 1")))
     for aspect, sentiment in sentiments1.items():
         aspect_uri = ESG[aspect]
-        g.add((doc1_uri, ESG.hasAspect, aspect_uri))
-        g.add((aspect_uri, RDF.type, ESG.Aspect))
+        g.add((doc1_uri, ESG.hasAspect, aspect_uri)); g.add((aspect_uri, RDF.type, ESG.Aspect))
         g.add((aspect_uri, RDFS.label, Literal(aspect.replace('_', ' ').title())))
         g.add((aspect_uri, ESG.hasSentiment, Literal(sentiment)))
 
-    # Add triples for Document 2
+    # Document 2
     doc2_uri = DOC['report_2']
-    g.add((doc2_uri, RDF.type, ESG.Document))
-    g.add((doc2_uri, RDFS.label, Literal("Document 2")))
+    g.add((doc2_uri, RDF.type, ESG.Document)); g.add((doc2_uri, RDFS.label, Literal("Document 2")))
    for aspect, sentiment in sentiments2.items():
         aspect_uri = ESG[aspect]
-        g.add((doc2_uri, ESG.hasAspect, aspect_uri))
-        g.add((aspect_uri, RDF.type, ESG.Aspect))
+        g.add((doc2_uri, ESG.hasAspect, aspect_uri)); g.add((aspect_uri, RDF.type, ESG.Aspect))
         g.add((aspect_uri, RDFS.label, Literal(aspect.replace('_', ' ').title())))
         g.add((aspect_uri, ESG.hasSentiment, Literal(sentiment)))
 
-    # Save graph to a file
     output_path = "esg_knowledge_graph.ttl"
     g.serialize(destination=output_path, format='turtle')
     return output_path
 
+def create_ontology_tree_view():
+    """Creates a markdown representation of the ontology hierarchy."""
+    tree = "**ESG Ontology Structure**\n\n"
+    parents = adapter.get_direct_parents()
+    children = defaultdict(list)
+    for child, parent in parents.items():
+        children[parent].append(child)
+
+    def build_tree(node, prefix=""):
+        nonlocal tree
+        tree += f"{prefix}- **{node.replace('_', ' ').title()}**\n"
+        if node in children:
+            for child in sorted(children[node]):
+                build_tree(child, prefix + "  ")
+
+    # Find root nodes (those that are parents but not children)
+    root_nodes = sorted(list(set(children.keys()) - set(parents.keys())))
+    for root in root_nodes:
+        build_tree(root)
+
+    return tree
+
-def compare_documents(text1, text2):
-    """Compares two documents for sentiment consistency and generates a knowledge graph."""
-
-    # Analyze both documents
-    sentiments1 = analyze_single_document(text1)
-    sentiments2 = analyze_single_document(text2)
-
-    # Generate and save the knowledge graph
-    kg_file_path = generate_knowledge_graph(sentiments1, sentiments2)
+def analyze_and_compare(text1, text2):
+    """Main function to drive the analysis and comparison for the dashboard."""
+
+    # Analyze both documents
+    sentiments1, df1 = analyze_single_document(text1, "Document 1")
+    sentiments2, df2 = analyze_single_document(text2, "Document 2")
 
-    # --- Generate formatted output for each document ---
-    report1 = "--- Document 1 Analysis ---\n"
-    for aspect, sentiment in sentiments1.items():
-        report1 += f"- **{aspect.replace('_', ' ').title()}**: {sentiment.title()}\n"
-
-    report2 = "\n--- Document 2 Analysis ---\n"
-    for aspect, sentiment in sentiments2.items():
-        report2 += f"- **{aspect.replace('_', ' ').title()}**: {sentiment.title()}\n"
-
-    # --- Cross-Document Consistency Analysis ---
-    consistency_report = "\n--- Cross-Document Consistency Analysis ---\n"
-    all_aspects = sorted(list(set(sentiments1.keys()) | set(sentiments2.keys())))
+    # --- Generate Comparison Reports ---
+    # 1. Cross-Document Consistency Analysis
+    consistency_report = "**Sentiment Drift Analysis**\n\n"
+    all_aspects = sorted(list(set(sentiments1.keys()) | set(sentiments2.keys())))
     found_drift = False
     for aspect in all_aspects:
-        sentiment1 = sentiments1.get(aspect)
-        sentiment2 = sentiments2.get(aspect)
-
-        aspect_name = aspect.replace('_', ' ').title()
-
-        if sentiment1 and sentiment2 and sentiment1 != sentiment2:
-            consistency_report += f"🟡 **Sentiment Drift Detected for '{aspect_name}'**\n"
-            consistency_report += f"  - Document 1: {sentiment1.title()}\n"
-            consistency_report += f"  - Document 2: {sentiment2.title()}\n\n"
+        s1 = sentiments1.get(aspect); s2 = sentiments2.get(aspect)
+        name = aspect.replace('_', ' ').title()
+        if s1 and s2 and s1 != s2:
+            consistency_report += f"🟡 **Drift in '{name}'**: `{s1.title()}` ⟶ `{s2.title()}`\n"
             found_drift = True
-        elif sentiment1 and not sentiment2:
-            consistency_report += f"⚪️ **'{aspect_name}'** only found in Document 1 (Sentiment: {sentiment1.title()})\n\n"
-        elif not sentiment1 and sentiment2:
-            consistency_report += f"⚪️ **'{aspect_name}'** only found in Document 2 (Sentiment: {sentiment2.title()})\n\n"
-
-    if not found_drift:
-        consistency_report += "✅ No sentiment contradictions detected between the two documents for common aspects.\n"
-
-    # --- Discover New Potential Aspects ---
+        elif s1 and not s2:
+            consistency_report += f"⚪️ **'{name}'** only in Document 1 (Sentiment: {s1.title()})\n"
+        elif not s1 and s2:
+            consistency_report += f"⚪️ **'{name}'** only in Document 2 (Sentiment: {s2.title()})\n"
+    if not found_drift and any(all_aspects):
+        consistency_report += "✅ No sentiment contradictions detected for common aspects.\n"
+    elif not any(all_aspects):
+        consistency_report = "No aspects detected in either document."
+
+    # 2. Weakly Supervised Aspect Discovery
     all_text = text1 + "\n\n" + text2
     existing_aspects = set(sentiments1.keys()) | set(sentiments2.keys())
-    new_aspect_suggestions = discover_new_aspects(all_text, existing_aspects)
-
-    suggestions_report = "\n--- Suggested New Aspects (from both documents) ---\n"
-    if new_aspect_suggestions:
-        suggestions_report += "\n".join(f"- {s}" for s in new_aspect_suggestions)
-    else:
-        suggestions_report += "No new aspects discovered."
+    suggestions_report = "**Suggested New Aspects**\n\n" + discover_new_aspects(all_text, existing_aspects)
+
+    # --- Create Visualizations ---
+    combined_df = pd.concat([df1, df2])
+
+    # Sentiment Distribution Plot
+    sentiment_fig = None
+    if not combined_df.empty:
+        sentiment_counts = combined_df.groupby(['Document', 'Sentiment']).size().reset_index(name='Count')
+        sentiment_fig = px.bar(sentiment_counts, x='Document', y='Count', color='Sentiment',
+                               title="Sentiment Distribution Across Documents",
+                               color_discrete_map={'positive': '#2ca02c', 'negative': '#d62728', 'neutral': '#7f7f7f'})
+
+    # Bias & Confidence Plot
+    bias_fig = None
+    if not combined_df.empty:
+        bias_fig = px.scatter(combined_df, x='Confidence', y='Optimism Bias', color='Aspect',
+                              size=abs(combined_df['Optimism Bias']), hover_data=['Document'],
+                              title="Optimism Bias vs. Mapping Confidence")
+        bias_fig.add_hline(y=0, line_dash="dot", line_color="grey")
+
+    # Generate and save the knowledge graph
+    kg_file_path = generate_knowledge_graph(sentiments1, sentiments2)
 
-    return (report1 + report2 + consistency_report + suggestions_report, kg_file_path)
-
-
-iface = gr.Interface(
-    fn=compare_documents,
-    inputs=[
-        gr.Textbox(label="Input ESG Report Text 1 (e.g., 2022 Report)", lines=15, placeholder="Paste the first report here..."),
-        gr.Textbox(label="Input ESG Report Text 2 (e.g., 2023 Report)", lines=15, placeholder="Paste the second report here...")
-    ],
-    outputs=[
-        gr.Textbox(label="Cross-Document Consistency Analysis", lines=40),
-        gr.File(label="Download Knowledge Graph (RDF/Turtle)")
-    ],
-    title="ESG Cross-Document Sentiment Analysis & Weakly Supervised Aspect Discovery",
-    description="Compares two ESG reports for sentiment drift, generates a knowledge graph, and suggests new ESG aspects."
-)
+    return (consistency_report, suggestions_report, sentiment_fig, bias_fig, kg_file_path)
+
+
+# --- Gradio Interface ---
+with gr.Blocks(theme=gr.themes.Soft(), title="ESG Interpretability Dashboard") as iface:
+    gr.Markdown("# 🧩 ESG Interpretability Dashboard & Bias Analysis")
+    gr.Markdown("Compare ESG reports to analyze sentiment drift, discover new aspects, and visualize model interpretability metrics.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            text1 = gr.Textbox(label="Input ESG Report Text 1 (e.g., 2022 Report)", lines=20, placeholder="Paste the first report here...")
+        with gr.Column(scale=1):
+            text2 = gr.Textbox(label="Input ESG Report Text 2 (e.g., 2023 Report)", lines=20, placeholder="Paste the second report here...")
+
+    analyze_btn = gr.Button("Analyze & Compare Documents", variant="primary")
+
+    with gr.Tabs():
+        with gr.TabItem("📊 Analysis & Visualizations"):
+            with gr.Row():
+                sentiment_plot = gr.Plot(label="Sentiment Distribution")
+                bias_plot = gr.Plot(label="Bias & Confidence Analysis")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    consistency_output = gr.Markdown(label="Cross-Document Analysis")
+                with gr.Column(scale=1):
+                    suggestions_output = gr.Markdown(label="Weak Supervision Suggestions")
+
+        with gr.TabItem("🌳 Ontology & Knowledge Graph"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    ontology_tree = gr.Markdown(value=create_ontology_tree_view(), label="ESG Ontology Hierarchy")
+                with gr.Column(scale=1):
+                    kg_output = gr.File(label="Download Knowledge Graph (RDF/Turtle)")
+                    gr.Markdown("The knowledge graph represents the extracted aspects and sentiments from both documents in a machine-readable format (RDF Turtle). You can use it with graph databases like Neo4j or query it with SPARQL.")
+
+    analyze_btn.click(
+        fn=analyze_and_compare,
+        inputs=[text1, text2],
+        outputs=[consistency_output, suggestions_output, sentiment_plot, bias_plot, kg_output]
+    )
 
 iface.launch()
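A note on the new bias metric in this diff: for every ontology match, a positive mention contributes `tone_weight * score` and a negative mention contributes `-score` (negative mentions ignore the section weight), and the per-aspect bias is the mean of those contributions. A minimal standalone sketch with made-up numbers (the mention tuples below are hypothetical, not app output):

```python
# Standalone sketch of the commit's optimism-bias aggregation for one aspect,
# using hypothetical (sentiment, match score, tone_weight) mentions.
from collections import defaultdict

optimism_bias_scores = defaultdict(list)

# Two positive mentions from a "strategy" section (tone_weight = 1.2) and one
# negative mention from a "results" section (tone_weight = 0.8).
mentions = [("positive", 0.9, 1.2), ("positive", 0.7, 1.2), ("negative", 0.8, 0.8)]

for sentiment, score, tone_weight in mentions:
    if sentiment == "positive":
        optimism_bias_scores["emissions"].append(tone_weight * score)
    elif sentiment == "negative":
        # As in the diff, negative mentions are weighted -1 regardless of tone_weight.
        optimism_bias_scores["emissions"].append(-1 * score)

scores = optimism_bias_scores["emissions"]
print(sum(scores) / len(scores))  # (1.08 + 0.84 - 0.80) / 3 ≈ 0.373
```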
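The new Ontology & Knowledge Graph tab notes that the Turtle export can be queried with SPARQL. A minimal sketch of that, assuming only the `esg_knowledge_graph.ttl` file this commit writes; since the ESG namespace URI is defined outside this diff, the query matches the `hasSentiment` predicate by its local name rather than assuming the full URI:

```python
# A minimal sketch: load the exported graph and list each aspect's label and
# sentiment. Note that generate_knowledge_graph attaches sentiments to a shared
# aspect URI, so an aspect whose sentiment drifted between documents will
# produce two rows here.
from rdflib import Graph

g = Graph()
g.parse("esg_knowledge_graph.ttl", format="turtle")

results = g.query("""
    SELECT ?label ?sentiment WHERE {
        ?aspect ?p ?sentiment ;
                <http://www.w3.org/2000/01/rdf-schema#label> ?label .
        FILTER(STRENDS(STR(?p), "hasSentiment"))
    }
""")
for label, sentiment in results:
    print(f"{label}: {sentiment}")
```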
requirements.txt CHANGED
@@ -7,4 +7,6 @@ vaderSentiment
 huggingface-hub
 protobuf==3.20.0
 rdflib
-keybert
+keybert
+pandas
+plotly
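With `pandas` and `plotly` added above, the Space should still run locally in the usual way for a single-file Gradio app: `pip install -r requirements.txt`, then `python app.py`.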