kylemontgomery committed on
Commit
af74d42
·
1 Parent(s): cf76c85

fix latex rendering

Browse files
Files changed (2) hide show
  1. app.py +161 -10
  2. requirements.txt +3 -0
app.py CHANGED
@@ -7,8 +7,24 @@ from typing import Any, Dict, List
7
 
8
  import gradio as gr
9
 
10
-
11
- # Regex to capture common LaTeX math delimiters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  MATH_PATTERN = re.compile(
13
  r"(\$\$(.*?)\$\$)" # $$block$$
14
  r"|(\$([^\$\n]+?)\$)" # $inline$
@@ -19,10 +35,12 @@ MATH_PATTERN = re.compile(
19
 
20
 
21
  def format_step_content(content: str) -> str:
22
- """Render content to simple HTML with basic LaTeX handling.
23
 
24
- This keeps things lightweight: escape non-math parts and preserve
25
- math segments literally so the viewer can copy/read the LaTeX.
 
 
26
 
27
  Args:
28
  content: Raw text content possibly containing LaTeX snippets.
@@ -33,23 +51,133 @@ def format_step_content(content: str) -> str:
33
  if not content:
34
  return ""
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  text = content
 
37
  rendered_parts: list[str] = []
38
  last_idx = 0
39
 
40
  for m in MATH_PATTERN.finditer(text):
41
- # Non-math prefix
42
  prefix = text[last_idx : m.start()]
43
  if prefix:
 
 
 
 
44
  rendered_parts.append(html.escape(prefix).replace("\n", "<br>"))
45
 
46
- # Math segment (kept as-is, escaped for HTML visibility)
47
- rendered_parts.append(html.escape(m.group(0)).replace("\n", "<br>"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  last_idx = m.end()
49
 
50
- # Trailing non-math
51
  tail = text[last_idx:]
52
  if tail:
 
 
 
53
  rendered_parts.append(html.escape(tail).replace("\n", "<br>"))
54
 
55
  return "".join(rendered_parts)
@@ -292,7 +420,30 @@ def create_gradio_interface(data_dir: str):
292
  """
293
  runs = list_runs(data_dir)
294
 
295
- with gr.Blocks(title="Simple Episode Viewer") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  with gr.Group():
297
  with gr.Row():
298
  run_dropdown = gr.Dropdown(choices=runs, value=None, label="Run (subdirectory)")
 
7
 
8
  import gradio as gr
9
 
10
+ # Optional LaTeX → MathML conversion (install: pip install latex2mathml)
11
+ try:
12
+ from latex2mathml.converter import convert as _latex_to_mathml
13
+ except Exception: # pragma: no cover
14
+ _latex_to_mathml = None
15
+
16
+ # Optional Markdown renderer (install: pip install markdown)
17
+ try:
18
+ import markdown as _markdown
19
+ except Exception: # pragma: no cover
20
+ _markdown = None
21
+
22
+ # Precompile math pattern once to avoid recompilation on every render
23
+ # Captures four math forms with inner content groups preserved:
24
+ # 1) $$ ... $$ → group2
25
+ # 2) $ ... $ → group4
26
+ # 3) \[ ... \] → group6
27
+ # 4) \( ... \) → group8
28
  MATH_PATTERN = re.compile(
29
  r"(\$\$(.*?)\$\$)" # $$block$$
30
  r"|(\$([^\$\n]+?)\$)" # $inline$
 
35
 
36
 
37
  def format_step_content(content: str) -> str:
38
+ """Render content to HTML with Markdown and LaTeX support.
39
 
40
+ If the optional `markdown` package is available, we tokenize LaTeX spans,
41
+ render Markdown so headings/lists/emphasis work, then substitute tokens
42
+ with MathML (via latex2mathml). Otherwise, we fall back to escaping with
43
+ inline MathML conversion.
44
 
45
  Args:
46
  content: Raw text content possibly containing LaTeX snippets.
 
51
  if not content:
52
  return ""
53
 
54
+ # Use precompiled regex to capture block and inline math (see MATH_PATTERN above)
55
+
56
+ if _markdown is not None:
57
+ # Tokenize math, render Markdown, then substitute tokens with MathML
58
+ parts: list[str] = []
59
+ token_to_html: dict[str, str] = {}
60
+ last_idx = 0
61
+ token_index = 0
62
+
63
+ for m in MATH_PATTERN.finditer(content):
64
+ # Non-math prefix: normalize light TeX-ish helpers for MD
65
+ prefix = content[last_idx : m.start()]
66
+ if prefix:
67
+ prefix = re.sub(r"\\text\{([^}]*)\}", r"\1", prefix)
68
+ prefix = re.sub(r"\\emph\{([^}]*)\}", r"*\1*", prefix)
69
+ prefix = re.sub(r"\\Bbb\{([^}]*)\}", r"\\mathbb{\1}", prefix)
70
+ parts.append(prefix)
71
+
72
+ # Extract LaTeX
73
+ latex_src = None
74
+ display = False
75
+ if m.group(2) is not None:
76
+ latex_src = m.group(2)
77
+ display = True
78
+ elif m.group(4) is not None:
79
+ latex_src = m.group(4)
80
+ display = False
81
+ elif m.group(6) is not None:
82
+ latex_src = m.group(6)
83
+ display = True
84
+ elif m.group(8) is not None:
85
+ latex_src = m.group(8)
86
+ display = False
87
+
88
+ token = f"[[[MATH_TOKEN_{token_index}]]]"
89
+ token_index += 1
90
+ if latex_src is None:
91
+ token_to_html[token] = html.escape(m.group(0)).replace("\n", "<br>")
92
+ else:
93
+ try:
94
+ if _latex_to_mathml is not None:
95
+ mathml = _latex_to_mathml(latex_src)
96
+ if display and mathml.startswith("<math") and " display=" not in mathml:
97
+ mathml = mathml.replace("<math", '<math display="block"', 1)
98
+ token_to_html[token] = mathml
99
+ else:
100
+ token_to_html[token] = html.escape(m.group(0)).replace("\n", "<br>")
101
+ except Exception:
102
+ token_to_html[token] = html.escape(m.group(0)).replace("\n", "<br>")
103
+
104
+ parts.append(token)
105
+ last_idx = m.end()
106
+
107
+ # Trailing non-math
108
+ tail = content[last_idx:]
109
+ if tail:
110
+ tail = re.sub(r"\\text\{([^}]*)\}", r"\1", tail)
111
+ tail = re.sub(r"\\emph\{([^}]*)\}", r"*\1*", tail)
112
+ tail = re.sub(r"\\Bbb\{([^}]*)\}", r"\\mathbb{\1}", tail)
113
+ parts.append(tail)
114
+
115
+ text_with_tokens = "".join(parts)
116
+ try:
117
+ html_out = _markdown.markdown(text_with_tokens, extensions=["extra", "sane_lists", "nl2br"])
118
+ except Exception:
119
+ html_out = html.escape(text_with_tokens).replace("\n", "<br>")
120
+
121
+ for token, token_html in token_to_html.items():
122
+ html_out = html_out.replace(token, token_html)
123
+ return html_out
124
+
125
+ # Fallback: previous approach (safe HTML escaping + optional MathML)
126
  text = content
127
+
128
  rendered_parts: list[str] = []
129
  last_idx = 0
130
 
131
  for m in MATH_PATTERN.finditer(text):
132
+ # Add preceding non-math segment (escaped, with mild TeX tweaks)
133
  prefix = text[last_idx : m.start()]
134
  if prefix:
135
+ # In non-math, normalize a few TeX-ish helpers
136
+ prefix = re.sub(r"\\text\{([^}]*)\}", r"\1", prefix)
137
+ prefix = re.sub(r"\\emph\{([^}]*)\}", r"<em>\1</em>", prefix)
138
+ prefix = re.sub(r"\\Bbb\{([^}]*)\}", r"\\mathbb{\1}", prefix)
139
  rendered_parts.append(html.escape(prefix).replace("\n", "<br>"))
140
 
141
+ # Determine which group matched and extract LaTeX
142
+ latex_src = None
143
+ display = False
144
+ if m.group(2) is not None: # $$ ... $$
145
+ latex_src = m.group(2)
146
+ display = True
147
+ elif m.group(4) is not None: # $ ... $
148
+ latex_src = m.group(4)
149
+ display = False
150
+ elif m.group(6) is not None: # \[ ... \]
151
+ latex_src = m.group(6)
152
+ display = True
153
+ elif m.group(8) is not None: # \( ... \)
154
+ latex_src = m.group(8)
155
+ display = False
156
+
157
+ if latex_src is None:
158
+ # Should not happen; just append raw match safely
159
+ rendered_parts.append(html.escape(m.group(0)).replace("\n", "<br>"))
160
+ else:
161
+ try:
162
+ mathml = _latex_to_mathml(latex_src)
163
+ # Ensure block math displays as block
164
+ if display and mathml.startswith("<math"):
165
+ if " display=" not in mathml:
166
+ mathml = mathml.replace("<math", '<math display="block"', 1)
167
+ rendered_parts.append(mathml)
168
+ except Exception:
169
+ # On failure, fall back to showing the LaTeX literally
170
+ fallback = html.escape(m.group(0)).replace("\n", "<br>")
171
+ rendered_parts.append(fallback)
172
+
173
  last_idx = m.end()
174
 
175
+ # Trailing non-math segment
176
  tail = text[last_idx:]
177
  if tail:
178
+ tail = re.sub(r"\\text\{([^}]*)\}", r"\1", tail)
179
+ tail = re.sub(r"\\emph\{([^}]*)\}", r"<em>\1</em>", tail)
180
+ tail = re.sub(r"\\Bbb\{([^}]*)\}", r"\\mathbb{\1}", tail)
181
  rendered_parts.append(html.escape(tail).replace("\n", "<br>"))
182
 
183
  return "".join(rendered_parts)
 
420
  """
421
  runs = list_runs(data_dir)
422
 
423
+ custom_css = """
424
+ /* ─── force global light theme & readable text ─────────────── */
425
+ :root, html, body, #root, .gradio-container{
426
+ background:#ffffff !important;
427
+ color-scheme:light;
428
+ color:#111 !important;
429
+ }
430
+ .gradio-container{
431
+ --body-background-fill:#ffffff;
432
+ --background-fill-primary:#ffffff;
433
+ --background-fill-secondary:#ffffff;
434
+ --block-background-fill:#ffffff;
435
+ --panel-background-fill:#ffffff;
436
+ }
437
+
438
+ /* ─── normalize MathML text color to match content ─────────── */
439
+ math, math *{
440
+ color:#333 !important;
441
+ fill:#333 !important;
442
+ stroke:#333 !important;
443
+ }
444
+ """
445
+
446
+ with gr.Blocks(title="Simple Episode Viewer", css=custom_css) as demo:
447
  with gr.Group():
448
  with gr.Row():
449
  run_dropdown = gr.Dropdown(choices=runs, value=None, label="Run (subdirectory)")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ markdown
3
+ latex2mathml