Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
1a99b8c
1
Parent(s):
f75f514
update
Browse files
app.py
CHANGED
|
@@ -38,35 +38,85 @@ HEAD = """
|
|
| 38 |
|
| 39 |
HTML = f"""
|
| 40 |
<div id="banner">
|
| 41 |
-
<h1
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
</div>
|
| 47 |
</div>
|
| 48 |
"""
|
| 49 |
|
| 50 |
CSS = """
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
div#banner {
|
| 52 |
display: flex;
|
| 53 |
flex-direction: column;
|
| 54 |
align-items: center;
|
| 55 |
justify-content: center;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
}
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
}
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
span p {
|
| 64 |
font-size: var(--block-info-text-size);
|
| 65 |
line-height: var(--line-sm);
|
| 66 |
color: var(--block-label-text-color);
|
| 67 |
}
|
| 68 |
}
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
.slider_input_container span {color: var(--body-text-color);}
|
| 71 |
.slider_input_container {
|
| 72 |
display: flex;
|
|
@@ -74,7 +124,8 @@ div#component-10 {
|
|
| 74 |
input {appearance: auto;}
|
| 75 |
}
|
| 76 |
}
|
| 77 |
-
|
|
|
|
| 78 |
justify-content: unset;
|
| 79 |
label {margin-right: var(--size-2);}
|
| 80 |
label span {
|
|
@@ -259,23 +310,23 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
|
|
| 259 |
|
| 260 |
gr.HTML(HTML)
|
| 261 |
|
| 262 |
-
|
| 263 |
-
def render_state(endpoint_state):
|
| 264 |
-
if endpoint_state == "Ready":
|
| 265 |
-
color = "green"
|
| 266 |
-
elif endpoint_state == "Server Error":
|
| 267 |
-
color = "red"
|
| 268 |
-
else:
|
| 269 |
-
color = "orange"
|
| 270 |
-
|
| 271 |
-
if endpoint_state != None:
|
| 272 |
-
gr.Markdown(f'🤖 {model_name} | Inference Endpoint State: <span style="color:{color}; font-weight: bold;">{endpoint_state}</span>')
|
| 273 |
-
|
| 274 |
-
with gr.Row():
|
| 275 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
with gr.Row():
|
| 277 |
-
steer_toggle = Toggle(label="Steering", info="Turn off to generate original outputs", value=True, interactive=True, scale=2)
|
| 278 |
-
coeff = gr.Slider(label="
|
| 279 |
|
| 280 |
@gr.on(inputs=[steer_toggle], outputs=[steer_toggle, coeff], triggers=[steer_toggle.change])
|
| 281 |
def update_toggle(toggle_value):
|
|
|
|
| 38 |
|
| 39 |
HTML = f"""
|
| 40 |
<div id="banner">
|
| 41 |
+
<h1><img src="/gradio_api/file=assets/rudder_3094973.png"> LLM Censorship Steering</h1>
|
| 42 |
+
|
| 43 |
+
<div id="links" class="row" style="margin-bottom: .8em;">
|
| 44 |
+
<i class="fa-solid fa-file-pdf fa-lg"></i><a href="https://arxiv.org/abs/2504.17130"> Paper</a>
|
| 45 |
+
<i class="fa-solid fa-blog fa-lg"></i><a href="https://hannahxchen.github.io/blog/2025/censorship-steering"> Blog Post</a>
|
| 46 |
+
<i class="fa-brands fa-github fa-lg"></i><a href="https://github.com/hannahxchen/llm-censorship-steering"> Code</a>
|
| 47 |
+
</div>
|
| 48 |
+
|
| 49 |
+
<div id="cover">
|
| 50 |
+
<img src="/gradio_api/file=assets/demo-cover.png">
|
| 51 |
</div>
|
| 52 |
</div>
|
| 53 |
"""
|
| 54 |
|
| 55 |
CSS = """
|
| 56 |
+
div.gradio-container .app {
|
| 57 |
+
max-width: 1600px !important;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
div#banner {
|
| 61 |
display: flex;
|
| 62 |
flex-direction: column;
|
| 63 |
align-items: center;
|
| 64 |
justify-content: center;
|
| 65 |
+
|
| 66 |
+
h1 {
|
| 67 |
+
font-size: 32px;
|
| 68 |
+
line-height: 1.35em;
|
| 69 |
+
margin-bottom: 0em;
|
| 70 |
+
display: flex;
|
| 71 |
+
|
| 72 |
+
img {
|
| 73 |
+
display: inline;
|
| 74 |
+
height: 1.35em;
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
div#cover img {
|
| 79 |
+
max-height: 130px;
|
| 80 |
+
padding-top: 0.5em;
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
@media (max-width: 500px) {
|
| 85 |
+
div#banner {
|
| 86 |
+
h1 {
|
| 87 |
+
font-size: 22px;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
div#links {
|
| 91 |
+
font-size: 14px;
|
| 92 |
+
}
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
div#model-state p {
|
| 96 |
+
font-size: 14px;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
}
|
| 100 |
+
|
| 101 |
+
div#main-components {
|
| 102 |
+
align-items: flex-end;
|
| 103 |
}
|
| 104 |
+
|
| 105 |
+
div#steering-toggle {
|
| 106 |
+
padding-top: 8px;
|
| 107 |
+
padding-bottom: 8px;
|
| 108 |
+
.toggle-label {
|
| 109 |
+
color: var(--body-text-color);
|
| 110 |
+
}
|
| 111 |
span p {
|
| 112 |
font-size: var(--block-info-text-size);
|
| 113 |
line-height: var(--line-sm);
|
| 114 |
color: var(--block-label-text-color);
|
| 115 |
}
|
| 116 |
}
|
| 117 |
+
|
| 118 |
+
div#coeff-slider {
|
| 119 |
+
padding-bottom: 5px;
|
| 120 |
.slider_input_container span {color: var(--body-text-color);}
|
| 121 |
.slider_input_container {
|
| 122 |
display: flex;
|
|
|
|
| 124 |
input {appearance: auto;}
|
| 125 |
}
|
| 126 |
}
|
| 127 |
+
|
| 128 |
+
div#coeff-slider .wrap .head {
|
| 129 |
justify-content: unset;
|
| 130 |
label {margin-right: var(--size-2);}
|
| 131 |
label span {
|
|
|
|
| 310 |
|
| 311 |
gr.HTML(HTML)
|
| 312 |
|
| 313 |
+
with gr.Row(elem_id="main-components"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
with gr.Column(scale=1):
|
| 315 |
+
@gr.render(inputs=endpoint_state, triggers=[endpoint_state.change])
|
| 316 |
+
def render_state(endpoint_state):
|
| 317 |
+
if endpoint_state == "Ready":
|
| 318 |
+
color = "green"
|
| 319 |
+
elif endpoint_state == "Server Error":
|
| 320 |
+
color = "red"
|
| 321 |
+
else:
|
| 322 |
+
color = "orange"
|
| 323 |
+
|
| 324 |
+
if endpoint_state != None:
|
| 325 |
+
gr.Markdown(f'🤖 {model_name} | Inference Endpoint State: <span style="color:{color}; font-weight: bold;">{endpoint_state}</span>', elem_id="model-state")
|
| 326 |
+
|
| 327 |
with gr.Row():
|
| 328 |
+
steer_toggle = Toggle(label="Steering", info="Turn off to generate original outputs", value=True, interactive=True, scale=2, elem_id="steering-toggle")
|
| 329 |
+
coeff = gr.Slider(label="Coefficient:", value=-1.0, minimum=-2, maximum=2, step=0.1, scale=8, show_reset_button=False, elem_id="coeff-slider")
|
| 330 |
|
| 331 |
@gr.on(inputs=[steer_toggle], outputs=[steer_toggle, coeff], triggers=[steer_toggle.change])
|
| 332 |
def update_toggle(toggle_value):
|