darisdzakwanhoesien2 committed
Commit 8cd5464 · 1 Parent(s): ef22374

Stage 10 — Interpretability Dashboard & Bias Analysis

Files changed (2):
  1. app.py +159 -95
  2. requirements.txt +3 -1
app.py CHANGED
@@ -5,6 +5,8 @@ from collections import Counter, defaultdict
 from rdflib import Graph, Literal, Namespace, URIRef
 from rdflib.namespace import RDF, RDFS
 from keybert import KeyBERT
+import pandas as pd
+import plotly.express as px
 
 # --- Model Configuration ---
 # To use a fine-tuned model, change this path to the directory where your model is saved.
@@ -53,10 +55,12 @@ def detect_sections(text):
 
     return sections
 
-def analyze_single_document(text):
-    """Analyzes a single document and returns aspect-level sentiment."""
+def analyze_single_document(text, doc_name="Document"):
+    """Analyzes a single document and returns aspect-level sentiment and other metrics."""
     detected_sections = detect_sections(text)
     aspect_sentiments = defaultdict(list)
+    aspect_confidence = defaultdict(list)
+    optimism_bias_scores = defaultdict(list)
 
     for section in detected_sections:
         title = section['title']
@@ -65,9 +69,13 @@ def analyze_single_document(text):
         if not content.strip():
             continue
 
-        weight = 1.0
-        if any(keyword in title.lower() for keyword in ["results", "performance"]):
-            weight = 1.5
+        # Section-aware weighting (Bias Analysis)
+        # Increase weight for forward-looking/promise sections, decrease for results
+        tone_weight = 1.0
+        if any(keyword in title.lower() for keyword in ["strategy", "commitment", "goal"]):
+            tone_weight = 1.2  # Higher optimism bias likely
+        if any(keyword in title.lower() for keyword in ["results", "performance", "data"]):
+            tone_weight = 0.8  # Lower optimism bias likely
 
         mapping = adapter.map_term(content)
 
@@ -75,148 +83,204 @@ def analyze_single_document(text):
         sentiment = mapping['sentiment']
         for match in mapping['matches']:
             aspect = match['mapped_to']
+            score = match['score']
+
             aspect_sentiments[aspect].append(sentiment)
+            aspect_confidence[aspect].append(score)
+
+            # Calculate a simple optimism score
+            if sentiment == 'positive':
+                optimism_bias_scores[aspect].append(tone_weight * score)
+            elif sentiment == 'negative':
+                optimism_bias_scores[aspect].append(-1 * score)  # Negative sentiment counts against optimism
 
-    # Aggregate sentiments for each aspect (e.g., by taking the most common one)
+    # Aggregate results
     aggregated_sentiments = {}
+    avg_confidence = {}
+    final_optimism_bias = {}
+
     for aspect, sentiments in aspect_sentiments.items():
         if sentiments:
+            # Sentiment: most common
             aggregated_sentiments[aspect] = Counter(sentiments).most_common(1)[0][0]
+            # Confidence: average score for the aspect
+            avg_confidence[aspect] = sum(aspect_confidence[aspect]) / len(aspect_confidence[aspect])
+            # Optimism Bias: average of the weighted scores
+            if aspect in optimism_bias_scores:
+                final_optimism_bias[aspect] = sum(optimism_bias_scores[aspect]) / len(optimism_bias_scores[aspect])
+            else:
+                final_optimism_bias[aspect] = 0
+
+    # Create a DataFrame for visualization
+    df = pd.DataFrame({
+        'Aspect': [a.replace('_', ' ').title() for a in aggregated_sentiments.keys()],
+        'Sentiment': list(aggregated_sentiments.values()),
+        'Confidence': [avg_confidence.get(a, 0) for a in aggregated_sentiments.keys()],
+        'Optimism Bias': [final_optimism_bias.get(a, 0) for a in aggregated_sentiments.keys()],
+        'Document': doc_name
+    })
 
-    return aggregated_sentiments
+    return aggregated_sentiments, df
 
 def discover_new_aspects(text, existing_aspects):
     """Discovers new potential ESG aspects from text using KeyBERT."""
-
-    # Clean up text for better keyword extraction
     text = text.replace('\n', ' ')
-
-    # Extract keywords/keyphrases
     keywords = kw_model.extract_keywords(
-        text,
-        keyphrase_ngram_range=(1, 3),  # Consider phrases up to 3 words
-        stop_words='english',
-        use_mmr=True,  # Use Maximal Marginal Relevance to diversify results
-        diversity=0.7,
-        top_n=10  # Get top 10 candidates
+        text, keyphrase_ngram_range=(1, 3), stop_words='english',
+        use_mmr=True, diversity=0.7, top_n=10
     )
 
-    # Filter out keywords that are too similar to existing aspects
     suggested_aspects = []
     existing_aspect_labels = {aspect.replace('_', ' ') for aspect in existing_aspects}
 
     for keyword, score in keywords:
-        # Simple check to see if the keyword is already in our ontology
-        # A more advanced check would use semantic similarity.
         if keyword.lower() not in existing_aspect_labels and len(keyword) > 5:
-            suggested_aspects.append(f"**{keyword.title()}** (Score: {score:.2f})")
+            suggested_aspects.append(f"- **{keyword.title()}** (Confidence: {score:.2f})")
 
-    return suggested_aspects
+    return "\n".join(suggested_aspects) if suggested_aspects else "No new aspects discovered."
 
 def generate_knowledge_graph(sentiments1, sentiments2):
     """Generates an RDF knowledge graph from sentiment analysis results."""
     g = Graph()
-    g.bind("esg", ESG)
-    g.bind("doc", DOC)
-    g.bind("rdf", RDF)
-    g.bind("rdfs", RDFS)
+    g.bind("esg", ESG); g.bind("doc", DOC); g.bind("rdf", RDF); g.bind("rdfs", RDFS)
 
-    # Add triples for Document 1
+    # Document 1
     doc1_uri = DOC['report_1']
-    g.add((doc1_uri, RDF.type, ESG.Document))
-    g.add((doc1_uri, RDFS.label, Literal("Document 1")))
+    g.add((doc1_uri, RDF.type, ESG.Document)); g.add((doc1_uri, RDFS.label, Literal("Document 1")))
     for aspect, sentiment in sentiments1.items():
         aspect_uri = ESG[aspect]
-        g.add((doc1_uri, ESG.hasAspect, aspect_uri))
-        g.add((aspect_uri, RDF.type, ESG.Aspect))
+        g.add((doc1_uri, ESG.hasAspect, aspect_uri)); g.add((aspect_uri, RDF.type, ESG.Aspect))
         g.add((aspect_uri, RDFS.label, Literal(aspect.replace('_', ' ').title())))
         g.add((aspect_uri, ESG.hasSentiment, Literal(sentiment)))
 
-    # Add triples for Document 2
+    # Document 2
     doc2_uri = DOC['report_2']
-    g.add((doc2_uri, RDF.type, ESG.Document))
-    g.add((doc2_uri, RDFS.label, Literal("Document 2")))
+    g.add((doc2_uri, RDF.type, ESG.Document)); g.add((doc2_uri, RDFS.label, Literal("Document 2")))
    for aspect, sentiment in sentiments2.items():
         aspect_uri = ESG[aspect]
-        g.add((doc2_uri, ESG.hasAspect, aspect_uri))
-        g.add((aspect_uri, RDF.type, ESG.Aspect))
+        g.add((doc2_uri, ESG.hasAspect, aspect_uri)); g.add((aspect_uri, RDF.type, ESG.Aspect))
         g.add((aspect_uri, RDFS.label, Literal(aspect.replace('_', ' ').title())))
         g.add((aspect_uri, ESG.hasSentiment, Literal(sentiment)))
 
-    # Save graph to a file
     output_path = "esg_knowledge_graph.ttl"
     g.serialize(destination=output_path, format='turtle')
     return output_path
 
+def create_ontology_tree_view():
+    """Creates a markdown representation of the ontology hierarchy."""
+    tree = "**ESG Ontology Structure**\n\n"
+    parents = adapter.get_direct_parents()
+    children = defaultdict(list)
+    for child, parent in parents.items():
+        children[parent].append(child)
+
+    def build_tree(node, prefix=""):
+        nonlocal tree
+        tree += f"{prefix}- **{node.replace('_', ' ').title()}**\n"
+        if node in children:
+            for child in sorted(children[node]):
+                build_tree(child, prefix + "  ")
+
+    # Find root nodes (those that are parents but not children)
+    root_nodes = sorted(list(set(children.keys()) - set(parents.keys())))
+    for root in root_nodes:
+        build_tree(root)
+
+    return tree
+
-def compare_documents(text1, text2):
-    """Compares two documents for sentiment consistency and generates a knowledge graph."""
-
-    # Analyze both documents
-    sentiments1 = analyze_single_document(text1)
-    sentiments2 = analyze_single_document(text2)
-
-    # Generate and save the knowledge graph
-    kg_file_path = generate_knowledge_graph(sentiments1, sentiments2)
+def analyze_and_compare(text1, text2):
+    """Main function to drive the analysis and comparison for the dashboard."""
+
+    # Analyze both documents
+    sentiments1, df1 = analyze_single_document(text1, "Document 1")
+    sentiments2, df2 = analyze_single_document(text2, "Document 2")
 
-    # --- Generate formatted output for each document ---
-    report1 = "--- Document 1 Analysis ---\n"
-    for aspect, sentiment in sentiments1.items():
-        report1 += f"- **{aspect.replace('_', ' ').title()}**: {sentiment.title()}\n"
-
-    report2 = "\n--- Document 2 Analysis ---\n"
-    for aspect, sentiment in sentiments2.items():
-        report2 += f"- **{aspect.replace('_', ' ').title()}**: {sentiment.title()}\n"
-
-    # --- Cross-Document Consistency Analysis ---
-    consistency_report = "\n--- Cross-Document Consistency Analysis ---\n"
-    all_aspects = sorted(list(set(sentiments1.keys()) | set(sentiments2.keys())))
+    # --- Generate Comparison Reports ---
+    # 1. Cross-Document Consistency Analysis
+    consistency_report = "**Sentiment Drift Analysis**\n\n"
+    all_aspects = sorted(list(set(sentiments1.keys()) | set(sentiments2.keys())))
     found_drift = False
     for aspect in all_aspects:
-        sentiment1 = sentiments1.get(aspect)
-        sentiment2 = sentiments2.get(aspect)
-
-        aspect_name = aspect.replace('_', ' ').title()
-
-        if sentiment1 and sentiment2 and sentiment1 != sentiment2:
-            consistency_report += f"🟡 **Sentiment Drift Detected for '{aspect_name}'**\n"
-            consistency_report += f"  - Document 1: {sentiment1.title()}\n"
-            consistency_report += f"  - Document 2: {sentiment2.title()}\n\n"
+        s1 = sentiments1.get(aspect); s2 = sentiments2.get(aspect)
+        name = aspect.replace('_', ' ').title()
+        if s1 and s2 and s1 != s2:
+            consistency_report += f"🟡 **Drift in '{name}'**: `{s1.title()}` ⟶ `{s2.title()}`\n"
             found_drift = True
-        elif sentiment1 and not sentiment2:
-            consistency_report += f"⚪️ **'{aspect_name}'** only found in Document 1 (Sentiment: {sentiment1.title()})\n\n"
-        elif not sentiment1 and sentiment2:
-            consistency_report += f"⚪️ **'{aspect_name}'** only found in Document 2 (Sentiment: {sentiment2.title()})\n\n"
-
-    if not found_drift:
-        consistency_report += "✅ No sentiment contradictions detected between the two documents for common aspects.\n"
-
-    # --- Discover New Potential Aspects ---
+        elif s1 and not s2:
+            consistency_report += f"⚪️ **'{name}'** only in Document 1 (Sentiment: {s1.title()})\n"
+        elif not s1 and s2:
+            consistency_report += f"⚪️ **'{name}'** only in Document 2 (Sentiment: {s2.title()})\n"
+    if not found_drift and any(all_aspects):
+        consistency_report += "✅ No sentiment contradictions detected for common aspects.\n"
+    elif not any(all_aspects):
+        consistency_report = "No aspects detected in either document."
+
+    # 2. Weakly Supervised Aspect Discovery
     all_text = text1 + "\n\n" + text2
     existing_aspects = set(sentiments1.keys()) | set(sentiments2.keys())
-    new_aspect_suggestions = discover_new_aspects(all_text, existing_aspects)
-
-    suggestions_report = "\n--- Suggested New Aspects (from both documents) ---\n"
-    if new_aspect_suggestions:
-        suggestions_report += "\n".join(f"- {s}" for s in new_aspect_suggestions)
-    else:
-        suggestions_report += "No new aspects discovered."
+    suggestions_report = "**Suggested New Aspects**\n\n" + discover_new_aspects(all_text, existing_aspects)
+
+    # --- Create Visualizations ---
+    combined_df = pd.concat([df1, df2])
+
+    # Sentiment Distribution Plot
+    sentiment_fig = None
+    if not combined_df.empty:
+        sentiment_counts = combined_df.groupby(['Document', 'Sentiment']).size().reset_index(name='Count')
+        sentiment_fig = px.bar(sentiment_counts, x='Document', y='Count', color='Sentiment',
+                               title="Sentiment Distribution Across Documents",
+                               color_discrete_map={'positive': '#2ca02c', 'negative': '#d62728', 'neutral': '#7f7f7f'})
+
+    # Bias & Confidence Plot
+    bias_fig = None
+    if not combined_df.empty:
+        bias_fig = px.scatter(combined_df, x='Confidence', y='Optimism Bias', color='Aspect',
+                              size=abs(combined_df['Optimism Bias']), hover_data=['Document'],
+                              title="Optimism Bias vs. Mapping Confidence")
+        bias_fig.add_hline(y=0, line_dash="dot", line_color="grey")
+
+    # Generate and save the knowledge graph
+    kg_file_path = generate_knowledge_graph(sentiments1, sentiments2)
 
-    return (report1 + report2 + consistency_report + suggestions_report, kg_file_path)
-
-
-iface = gr.Interface(
-    fn=compare_documents,
-    inputs=[
-        gr.Textbox(label="Input ESG Report Text 1 (e.g., 2022 Report)", lines=15, placeholder="Paste the first report here..."),
-        gr.Textbox(label="Input ESG Report Text 2 (e.g., 2023 Report)", lines=15, placeholder="Paste the second report here...")
-    ],
-    outputs=[
-        gr.Textbox(label="Cross-Document Consistency Analysis", lines=40),
-        gr.File(label="Download Knowledge Graph (RDF/Turtle)")
-    ],
-    title="ESG Cross-Document Sentiment Analysis & Weakly Supervised Aspect Discovery",
-    description="Compares two ESG reports for sentiment drift, generates a knowledge graph, and suggests new ESG aspects."
-)
+    return (consistency_report, suggestions_report, sentiment_fig, bias_fig, kg_file_path)
+
+
+# --- Gradio Interface ---
+with gr.Blocks(theme=gr.themes.Soft(), title="ESG Interpretability Dashboard") as iface:
+    gr.Markdown("# 🧩 ESG Interpretability Dashboard & Bias Analysis")
+    gr.Markdown("Compare ESG reports to analyze sentiment drift, discover new aspects, and visualize model interpretability metrics.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            text1 = gr.Textbox(label="Input ESG Report Text 1 (e.g., 2022 Report)", lines=20, placeholder="Paste the first report here...")
+        with gr.Column(scale=1):
+            text2 = gr.Textbox(label="Input ESG Report Text 2 (e.g., 2023 Report)", lines=20, placeholder="Paste the second report here...")
+
+    analyze_btn = gr.Button("Analyze & Compare Documents", variant="primary")
+
+    with gr.Tabs():
+        with gr.TabItem("📊 Analysis & Visualizations"):
+            with gr.Row():
+                sentiment_plot = gr.Plot(label="Sentiment Distribution")
+                bias_plot = gr.Plot(label="Bias & Confidence Analysis")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    consistency_output = gr.Markdown(label="Cross-Document Analysis")
+                with gr.Column(scale=1):
+                    suggestions_output = gr.Markdown(label="Weak Supervision Suggestions")
+
+        with gr.TabItem("🌳 Ontology & Knowledge Graph"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    ontology_tree = gr.Markdown(value=create_ontology_tree_view(), label="ESG Ontology Hierarchy")
+                with gr.Column(scale=1):
+                    kg_output = gr.File(label="Download Knowledge Graph (RDF/Turtle)")
+                    gr.Markdown("The knowledge graph represents the extracted aspects and sentiments from both documents in a machine-readable format (RDF Turtle). You can use it with graph databases like Neo4j or query it with SPARQL.")
+
+    analyze_btn.click(
+        fn=analyze_and_compare,
+        inputs=[text1, text2],
+        outputs=[consistency_output, suggestions_output, sentiment_plot, bias_plot, kg_output]
+    )
 
 iface.launch()
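A note on the new bias metric in this diff: for every ontology match, a positive mention contributes `tone_weight * score` and a negative mention contributes `-score` (negative mentions ignore the section weight), and the per-aspect bias is the mean of those contributions. A minimal standalone sketch with made-up numbers (the mention tuples below are hypothetical, not app output):

```python
# Standalone sketch of the commit's optimism-bias aggregation for one aspect,
# using hypothetical (sentiment, match score, tone_weight) mentions.
from collections import defaultdict

optimism_bias_scores = defaultdict(list)

# Two positive mentions from a "strategy" section (tone_weight = 1.2) and one
# negative mention from a "results" section (tone_weight = 0.8).
mentions = [("positive", 0.9, 1.2), ("positive", 0.7, 1.2), ("negative", 0.8, 0.8)]

for sentiment, score, tone_weight in mentions:
    if sentiment == "positive":
        optimism_bias_scores["emissions"].append(tone_weight * score)
    elif sentiment == "negative":
        # As in the diff, negative mentions are weighted -1 regardless of tone_weight.
        optimism_bias_scores["emissions"].append(-1 * score)

scores = optimism_bias_scores["emissions"]
print(sum(scores) / len(scores))  # (1.08 + 0.84 - 0.80) / 3 ≈ 0.373
```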
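The new Ontology & Knowledge Graph tab notes that the Turtle export can be queried with SPARQL. A minimal sketch of that, assuming only the `esg_knowledge_graph.ttl` file this commit writes; since the ESG namespace URI is defined outside this diff, the query matches the `hasSentiment` predicate by its local name rather than assuming the full URI:

```python
# A minimal sketch: load the exported graph and list each aspect's label and
# sentiment. Note that generate_knowledge_graph attaches sentiments to a shared
# aspect URI, so an aspect whose sentiment drifted between documents will
# produce two rows here.
from rdflib import Graph

g = Graph()
g.parse("esg_knowledge_graph.ttl", format="turtle")

results = g.query("""
    SELECT ?label ?sentiment WHERE {
        ?aspect ?p ?sentiment ;
                <http://www.w3.org/2000/01/rdf-schema#label> ?label .
        FILTER(STRENDS(STR(?p), "hasSentiment"))
    }
""")
for label, sentiment in results:
    print(f"{label}: {sentiment}")
```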
requirements.txt CHANGED
@@ -7,4 +7,6 @@ vaderSentiment
 huggingface-hub
 protobuf==3.20.0
 rdflib
-keybert
+keybert
+pandas
+plotly
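With `pandas` and `plotly` added above, the Space should still run locally in the usual way for a single-file Gradio app: `pip install -r requirements.txt`, then `python app.py`.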