Luigi committed on
Commit ba4a241 · 1 Parent(s): 09695c9

re-implement in FastAPI

Dockerfile CHANGED
@@ -13,49 +13,29 @@ RUN apt-get update && apt-get install -y \
     libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 
-# === CRITICAL FIX + PERFORMANCE OPTIMIZATIONS ===
-# Set Streamlit to use temporary directories for ALL storage
 ENV HOME=/tmp
-ENV STREAMLIT_GLOBAL_DEVELOPMENT_MODE=false
-ENV STREAMLIT_GLOBAL_DATA_PATH=/tmp
-ENV STREAMLIT_CONFIG_DIR=/tmp/.streamlit
 ENV HF_HOME=/tmp/huggingface
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONPATH="/app/src"
+ENV PORT=7860
 
-# Create directories with open permissions including static audio directory
-RUN mkdir -p /tmp/.streamlit /tmp/huggingface /app/static && \
-    chmod -R 777 /tmp /app/static
-
-# Create config file with proper settings for large file handling
-RUN mkdir -p /tmp/.streamlit && \
-    cat <<EOF > /tmp/.streamlit/config.toml
-[browser]
-gatherUsageStats = false
-
-[server]
-enableCORS = false
-enableXsrfProtection = false
-maxUploadSize = 500
-maxMessageSize = 500
-
-[runner]
-maxCachedEntries = 1000
-fastReruns = true
-EOF
+# Create writable directories used at runtime
+RUN mkdir -p /tmp/huggingface /app/static /app/static/audio /app/tmp && \
+    chmod -R 777 /tmp /app/static /app/tmp
 
 # Copy files
 COPY requirements.txt ./
 COPY src/ ./src/
 COPY static/ ./static/
+COPY frontend/ ./frontend/
+COPY models/ ./models/
 
 # Install Python dependencies
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-EXPOSE 8501
+EXPOSE 7860
 
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+HEALTHCHECK CMD curl --fail http://localhost:7860/health || exit 1
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", \
-    "--server.port=8501", \
-    "--server.address=0.0.0.0", \
-    "--server.maxUploadSize=500", \
-    "--server.maxMessageSize=500"]
+ENTRYPOINT ["python", "-m", "uvicorn", "src.server.main:app"]
+CMD ["--host", "0.0.0.0", "--port", "7860"]
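The new ENTRYPOINT boots uvicorn against src.server.main:app, and the HEALTHCHECK now probes GET /health on port 7860, the port Hugging Face Spaces expects. A minimal sketch of what that entry module must expose for the container to pass its health check; the actual src/server/main.py is not part of this diff, so treat the body as an assumption:

# Hypothetical sketch of src/server/main.py, showing only the surface the Dockerfile relies on.
from fastapi import FastAPI

app = FastAPI(title="VoxSum Studio")  # the object referenced by "src.server.main:app"

@app.get("/health")
def health() -> dict:
    # Must answer 2xx quickly, or `curl --fail` in the HEALTHCHECK marks the container unhealthy.
    return {"status": "ok"}

Splitting the launch into ENTRYPOINT plus CMD keeps host and port overridable at `docker run` time without replacing the uvicorn invocation itself.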
frontend/app.js ADDED
@@ -0,0 +1,768 @@
+const state = {
+  config: { moonshine: {}, sensevoice: {}, llms: {} },
+  backend: 'sensevoice',
+  utterances: [],
+  diarizedUtterances: null,
+  diarizationStats: null,
+  summary: '',
+  audioUrl: null,
+  sourcePath: null,
+  uploadedFile: null,
+  transcribing: false,
+  summarizing: false,
+};
+
+const elements = {
+  backendSelect: document.getElementById('backend-select'),
+  modelSelect: document.getElementById('model-select'),
+  llmSelect: document.getElementById('llm-select'),
+  promptInput: document.getElementById('prompt-input'),
+  vadSlider: document.getElementById('vad-threshold'),
+  vadValue: document.getElementById('vad-value'),
+  diarizationToggle: document.getElementById('diarization-toggle'),
+  diarizationSettings: document.getElementById('diarization-settings'),
+  numSpeakers: document.getElementById('num-speakers'),
+  clusterSlider: document.getElementById('cluster-threshold'),
+  clusterValue: document.getElementById('cluster-value'),
+  sensevoiceOptions: document.getElementById('sensevoice-options'),
+  sensevoiceLanguage: document.getElementById('sensevoice-language'),
+  transcribeBtn: document.getElementById('transcribe-btn'),
+  summaryBtn: document.getElementById('summary-btn'),
+  statusText: document.getElementById('status-text'),
+  audioPlayer: document.getElementById('audio-player'),
+  transcriptList: document.getElementById('transcript-list'),
+  transcriptTemplate: document.getElementById('utterance-template'),
+  utteranceCount: document.getElementById('utterance-count'),
+  summaryOutput: document.getElementById('summary-output'),
+  diarizationPanel: document.getElementById('diarization-summary'),
+  diarizationMetrics: document.getElementById('diarization-metrics'),
+  speakerBreakdown: document.getElementById('speaker-breakdown'),
+  transcriptFormat: document.getElementById('transcript-format'),
+  summaryFormat: document.getElementById('summary-format'),
+  exportTranscriptBtn: document.getElementById('export-transcript'),
+  exportSummaryBtn: document.getElementById('export-summary'),
+  includeTimestamps: document.getElementById('include-timestamps'),
+  fileInput: document.getElementById('file-input'),
+  youtubeUrl: document.getElementById('youtube-url'),
+  youtubeFetch: document.getElementById('youtube-fetch'),
+  podcastQuery: document.getElementById('podcast-query'),
+  podcastSearch: document.getElementById('podcast-search'),
+  podcastResults: document.getElementById('podcast-results'),
+  episodeResults: document.getElementById('episode-results'),
+};
+
+const TRANSCRIPT_FORMATS = [
+  'SRT (SubRip)',
+  'VTT (WebVTT)',
+  'ASS (Advanced SubStation Alpha)',
+  'Plain Text',
+  'JSON',
+  'ELAN (EAF)',
+];
+
+const SUMMARY_FORMATS = ['Markdown', 'Plain Text'];
+
+let activeTab = 'podcast-tab';
+let activeUtteranceIndex = -1;
+
+function setStatus(message, tone = 'info') {
+  elements.statusText.textContent = message;
+  elements.statusText.dataset.tone = tone;
+}
+
+function formatTime(seconds) {
+  const mins = Math.floor(seconds / 60);
+  const secs = Math.floor(seconds % 60).toString().padStart(2, '0');
+  return `${mins}:${secs}`;
+}
+
+function setListEmpty(container, message) {
+  if (!container) return;
+  container.innerHTML = `<div class="empty-state">${message}</div>`;
+}
+
+async function fetchConfig() {
+  try {
+    const res = await fetch('/api/config/models');
+    if (!res.ok) throw new Error('Failed to fetch model catalog');
+    state.config = await res.json();
+    populateModelSelect();
+    populateLLMSelect();
+    populateExportSelects();
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+function populateModelSelect() {
+  const backend = state.backend;
+  elements.modelSelect.innerHTML = '';
+  const models = backend === 'moonshine' ? state.config.moonshine : state.config.sensevoice;
+  Object.entries(models).forEach(([label, value]) => {
+    const option = document.createElement('option');
+    option.value = value;
+    option.textContent = label;
+    elements.modelSelect.appendChild(option);
+  });
+  if (elements.modelSelect.options.length > 0) {
+    elements.modelSelect.selectedIndex = 0;
+  }
+  elements.sensevoiceOptions.classList.toggle('hidden', backend !== 'sensevoice');
+}
+
+function populateLLMSelect() {
+  elements.llmSelect.innerHTML = '';
+  Object.keys(state.config.llms).forEach((name) => {
+    const option = document.createElement('option');
+    option.value = name;
+    option.textContent = name;
+    elements.llmSelect.appendChild(option);
+  });
+}
+
+function populateExportSelects() {
+  elements.transcriptFormat.innerHTML = '';
+  TRANSCRIPT_FORMATS.forEach((fmt) => {
+    const option = document.createElement('option');
+    option.value = fmt;
+    option.textContent = fmt;
+    elements.transcriptFormat.appendChild(option);
+  });
+
+  elements.summaryFormat.innerHTML = '';
+  SUMMARY_FORMATS.forEach((fmt) => {
+    const option = document.createElement('option');
+    option.value = fmt;
+    option.textContent = fmt;
+    elements.summaryFormat.appendChild(option);
+  });
+}
+
+function initTabs() {
+  document.querySelectorAll('.tab').forEach((tab) => {
+    tab.addEventListener('click', () => {
+      if (tab.dataset.target === activeTab) return;
+      document.querySelectorAll('.tab').forEach((btn) => btn.classList.remove('active'));
+      document.querySelectorAll('.tab-panel').forEach((panel) => panel.classList.remove('active'));
+      tab.classList.add('active');
+      document.getElementById(tab.dataset.target).classList.add('active');
+      activeTab = tab.dataset.target;
+    });
+  });
+}
+
+function initSidebarInteractions() {
+  elements.backendSelect.addEventListener('change', () => {
+    state.backend = elements.backendSelect.value;
+    populateModelSelect();
+  });
+
+  elements.vadSlider.addEventListener('input', () => {
+    elements.vadValue.textContent = Number(elements.vadSlider.value).toFixed(2);
+  });
+
+  elements.diarizationToggle.addEventListener('change', () => {
+    elements.diarizationSettings.classList.toggle('hidden', !elements.diarizationToggle.checked);
+  });
+
+  elements.clusterSlider.addEventListener('input', () => {
+    elements.clusterValue.textContent = Number(elements.clusterSlider.value).toFixed(2);
+  });
+}
+
+function resetTranscriptionState() {
+  state.utterances = [];
+  state.diarizedUtterances = null;
+  state.diarizationStats = null;
+  activeUtteranceIndex = -1;
+  elements.transcriptList.innerHTML = '';
+  elements.utteranceCount.textContent = '';
+  elements.diarizationPanel.classList.add('hidden');
+}
+
+function prepareTranscriptionOptions() {
+  const textnormValue = document.querySelector('input[name="textnorm"]:checked')?.value || 'withitn';
+  return {
+    backend: state.backend,
+    model_name: elements.modelSelect.value,
+    vad_threshold: Number(elements.vadSlider.value),
+    language: state.backend === 'sensevoice' ? elements.sensevoiceLanguage.value : 'auto',
+    textnorm: textnormValue,
+    diarization: {
+      enable: elements.diarizationToggle.checked,
+      num_speakers: Number(elements.numSpeakers.value || -1),
+      cluster_threshold: Number(elements.clusterSlider.value),
+    },
+  };
+}
+
+async function handleTranscription() {
+  if (state.transcribing) return;
+  if (!state.uploadedFile && !state.audioUrl) {
+    setStatus('Upload or select an audio source first', 'warning');
+    return;
+  }
+
+  resetTranscriptionState();
+  state.transcribing = true;
+  setStatus('Starting transcription...', 'info');
+
+  const formData = new FormData();
+  if (state.uploadedFile) {
+    formData.append('audio', state.uploadedFile, state.uploadedFile.name);
+  } else if (state.audioUrl) {
+    formData.append('source', state.audioUrl);
+  }
+  formData.append('options', JSON.stringify(prepareTranscriptionOptions()));
+
+  try {
+    const response = await fetch('/api/transcribe', {
+      method: 'POST',
+      body: formData,
+    });
+    if (!response.ok || !response.body) {
+      throw new Error('Transcription request failed');
+    }
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+    setStatus('Processing audio...', 'info');
+
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let lines = buffer.split('\n');
+      buffer = lines.pop();
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        const event = JSON.parse(line);
+        handleTranscriptionEvent(event);
+      }
+    }
+
+    if (buffer.trim()) {
+      handleTranscriptionEvent(JSON.parse(buffer));
+    }
+
+    setStatus('Transcription complete', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  } finally {
+    state.transcribing = false;
+  }
+}
+
+function handleTranscriptionEvent(event) {
+  switch (event.type) {
+    case 'ready':
+      if (event.audioUrl) {
+        state.audioUrl = event.audioUrl;
+        elements.audioPlayer.src = event.audioUrl;
+        elements.audioPlayer.currentTime = 0;
+      }
+      break;
+    case 'utterance':
+      if (event.utterance) {
+        state.utterances.push(event.utterance);
+        renderTranscript();
+      }
+      break;
+    case 'complete':
+      if (event.diarization) {
+        state.diarizedUtterances = event.diarization.utterances || [];
+        state.diarizationStats = event.diarization.stats || null;
+      }
+      if (event.utterances) {
+        const diarized = state.diarizedUtterances?.length ? state.diarizedUtterances : null;
+        state.utterances = diarized
+          ? diarized.map((utt, index) => ({
+              ...(event.utterances[index] || {}),
+              ...utt,
+            }))
+          : event.utterances;
+      } else if (state.diarizedUtterances?.length) {
+        state.utterances = state.diarizedUtterances;
+      }
+      renderTranscript();
+      renderDiarizationStats();
+      break;
+    case 'error':
+      setStatus(event.message || 'Transcription error', 'error');
+      break;
+  }
+}
+
+function renderTranscript() {
+  elements.transcriptList.innerHTML = '';
+  const fragment = document.createDocumentFragment();
+  state.utterances.forEach((utt, index) => {
+    const node = elements.transcriptTemplate.content.cloneNode(true);
+    const item = node.querySelector('.utterance-item');
+    item.dataset.index = index.toString();
+    item.dataset.start = utt.start;
+    item.dataset.end = utt.end;
+
+    node.querySelector('.timestamp').textContent = `[${formatTime(utt.start)}]`;
+    node.querySelector('.utterance-text').textContent = utt.text;
+
+    const speakerTag = node.querySelector('.speaker-tag');
+    if (typeof utt.speaker === 'number') {
+      speakerTag.textContent = `Speaker ${utt.speaker + 1}`;
+      speakerTag.classList.remove('hidden');
+    }
+
+    fragment.appendChild(node);
+  });
+  elements.transcriptList.appendChild(fragment);
+  elements.utteranceCount.textContent = `${state.utterances.length} segments`;
+}
+
+function renderDiarizationStats() {
+  if (!state.diarizationStats) {
+    elements.diarizationPanel.classList.add('hidden');
+    return;
+  }
+  elements.diarizationPanel.classList.remove('hidden');
+  const stats = state.diarizationStats;
+
+  elements.diarizationMetrics.innerHTML = '';
+  const metricsFragment = document.createDocumentFragment();
+
+  const totalCard = document.createElement('div');
+  totalCard.className = 'metric-card';
+  totalCard.innerHTML = `<strong>Total speakers:</strong> ${stats.total_speakers || 0}<br/><strong>Duration:</strong> ${stats.total_duration?.toFixed(1) || 0}s`;
+  metricsFragment.appendChild(totalCard);
+  elements.diarizationMetrics.appendChild(metricsFragment);
+
+  elements.speakerBreakdown.innerHTML = '';
+  const speakersFragment = document.createDocumentFragment();
+  Object.entries(stats.speakers || {}).forEach(([speakerId, info]) => {
+    const card = document.createElement('div');
+    card.className = 'metric-card';
+    card.innerHTML = `
+      <strong>Speaker ${Number(speakerId) + 1}</strong><br/>
+      Speaking time: ${info.speaking_time.toFixed(1)}s<br/>
+      Percentage: ${info.percentage.toFixed(1)}%<br/>
+      Utterances: ${info.utterances}<br/>
+      Avg length: ${info.avg_utterance_length.toFixed(1)}s
+    `;
+    speakersFragment.appendChild(card);
+  });
+  elements.speakerBreakdown.appendChild(speakersFragment);
+}
+
+function findActiveUtterance(currentTime) {
+  let left = 0;
+  let right = state.utterances.length - 1;
+  let match = -1;
+  while (left <= right) {
+    const mid = Math.floor((left + right) / 2);
+    const utt = state.utterances[mid];
+    if (currentTime >= utt.start && currentTime < utt.end) {
+      return mid;
+    }
+    if (currentTime < utt.start) {
+      right = mid - 1;
+    } else {
+      match = mid;
+      left = mid + 1;
+    }
+  }
+  return match;
+}
+
+function updateActiveUtterance(index) {
+  if (index === activeUtteranceIndex) return;
+  const previous = elements.transcriptList.querySelector('.utterance-item.active');
+  if (previous) previous.classList.remove('active');
+  const current = elements.transcriptList.querySelector(`.utterance-item[data-index="${index}"]`);
+  if (current) {
+    current.classList.add('active');
+    current.scrollIntoView({ behavior: 'smooth', block: 'center' });
+  }
+  activeUtteranceIndex = index;
+}
+
+function initAudioInteractions() {
+  elements.audioPlayer.addEventListener('timeupdate', () => {
+    if (!state.utterances.length) return;
+    const idx = findActiveUtterance(elements.audioPlayer.currentTime);
+    if (idx >= 0) updateActiveUtterance(idx);
+  });
+
+  elements.transcriptList.addEventListener('click', (event) => {
+    const item = event.target.closest('.utterance-item');
+    if (!item) return;
+    const editButton = event.target.closest('.edit-btn');
+    const saveButton = event.target.closest('.save-edit');
+    const cancelButton = event.target.closest('.cancel-edit');
+
+    const index = Number(item.dataset.index);
+
+    if (editButton) {
+      toggleEdit(item, true);
+      return;
+    }
+
+    if (saveButton) {
+      const textarea = item.querySelector('textarea');
+      const newText = textarea.value.trim();
+      if (newText.length === 0) return;
+      state.utterances[index].text = newText;
+      item.querySelector('.utterance-text').textContent = newText;
+      toggleEdit(item, false);
+      return;
+    }
+
+    if (cancelButton) {
+      toggleEdit(item, false);
+      return;
+    }
+
+    const start = Number(item.dataset.start);
+    seekToTime(start);
+  });
+}
+
+function toggleEdit(item, editing) {
+  const textBlock = item.querySelector('.utterance-text');
+  const editArea = item.querySelector('.edit-area');
+  if (!textBlock || !editArea) return;
+
+  if (editing) {
+    const textarea = editArea.querySelector('textarea');
+    textarea.value = textBlock.textContent;
+    textBlock.classList.add('hidden');
+    editArea.classList.remove('hidden');
+  } else {
+    textBlock.classList.remove('hidden');
+    editArea.classList.add('hidden');
+  }
+}
+
+function seekToTime(timeInSeconds) {
+  if (!Number.isFinite(timeInSeconds)) return;
+  const audio = elements.audioPlayer;
+
+  const executeSeek = () => {
+    audio.currentTime = Math.max(0, timeInSeconds);
+    updateActiveUtterance(findActiveUtterance(audio.currentTime));
+    audio.play().catch(() => {});
+  };
+
+  if (audio.readyState >= 1) {
+    executeSeek();
+  } else {
+    const onLoaded = () => {
+      executeSeek();
+      audio.removeEventListener('loadedmetadata', onLoaded);
+    };
+    audio.addEventListener('loadedmetadata', onLoaded);
+    audio.load();
+  }
+}
+
+async function handleSummaryGeneration() {
+  if (state.summarizing || !state.utterances.length) return;
+  state.summarizing = true;
+  setStatus('Generating summary...', 'info');
+  elements.summaryOutput.textContent = '';
+
+  const payload = {
+    transcript: state.utterances.map((u) => u.text).join('\n'),
+    llm_model: elements.llmSelect.value,
+    prompt: elements.promptInput.value || 'Summarize the transcript below.',
+  };
+
+  try {
+    const response = await fetch('/api/summarize', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+
+    if (!response.ok || !response.body) throw new Error('Failed to generate summary');
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let lines = buffer.split('\n');
+      buffer = lines.pop();
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        const event = JSON.parse(line);
+        if (event.type === 'partial' && event.content) {
+          elements.summaryOutput.textContent = event.content;
+        }
+      }
+    }
+
+    setStatus('Summary ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  } finally {
+    state.summarizing = false;
+  }
+}
+
+async function handleExportTranscript() {
+  if (!state.utterances.length) return;
+  const payload = {
+    format: elements.transcriptFormat.value,
+    include_timestamps: elements.includeTimestamps.checked,
+    utterances: state.utterances,
+  };
+  await downloadFile('/api/export/transcript', payload, 'transcript');
+}
+
+async function handleExportSummary() {
+  if (!elements.summaryOutput.textContent.trim()) return;
+  const payload = {
+    format: elements.summaryFormat.value,
+    summary: elements.summaryOutput.textContent,
+    metadata: {},
+  };
+  await downloadFile('/api/export/summary', payload, 'summary');
+}
+
+async function downloadFile(url, payload, prefix) {
+  try {
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+    if (!response.ok) throw new Error('Export failed');
+    const blob = await response.blob();
+    const filename = getFilenameFromDisposition(response.headers.get('Content-Disposition')) || `${prefix}.txt`;
+    const link = document.createElement('a');
+    link.href = URL.createObjectURL(blob);
+    link.download = filename;
+    link.click();
+    URL.revokeObjectURL(link.href);
+    setStatus('Export complete', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+function getFilenameFromDisposition(disposition) {
+  if (!disposition) return null;
+  const match = disposition.match(/filename="?([^"]+)"?/i);
+  return match ? match[1] : null;
+}
+
+function handleFileUpload(event) {
+  const file = event.target.files?.[0];
+  if (!file) return;
+  state.uploadedFile = file;
+  state.audioUrl = null;
+  const objectUrl = URL.createObjectURL(file);
+  elements.audioPlayer.src = objectUrl;
+  setStatus(`Loaded ${file.name}`, 'info');
+}
+
+async function handleYoutubeFetch() {
+  if (!elements.youtubeUrl.value.trim()) return;
+  setStatus('Downloading audio from YouTube...', 'info');
+  try {
+    const res = await fetch('/api/youtube/fetch', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ url: elements.youtubeUrl.value.trim() }),
+    });
+    if (!res.ok) throw new Error('YouTube download failed');
+    const data = await res.json();
+    state.audioUrl = data.audioUrl;
+    state.uploadedFile = null;
+    elements.audioPlayer.src = data.audioUrl;
+    setStatus('YouTube audio ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+async function handlePodcastSearch() {
+  const query = elements.podcastQuery.value.trim();
+  if (!query) return;
+  setStatus('Searching podcasts...', 'info');
+  setListEmpty(elements.podcastResults, 'Searching podcasts...');
+  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+  try {
+    const res = await fetch(`/api/podcast/search?query=${encodeURIComponent(query)}`);
+    if (!res.ok) throw new Error('Podcast search failed');
+    const series = await res.json();
+    if (!series.length) {
+      setListEmpty(elements.podcastResults, 'No podcasts match your search yet.');
+      return;
+    }
+    elements.podcastResults.innerHTML = '';
+    const fragment = document.createDocumentFragment();
+    series.forEach((item) => {
+      const div = document.createElement('div');
+      div.className = 'list-item';
+      div.innerHTML = `
+        <div>
+          <strong>${item.title}</strong><br/>
+          <span>${item.artist || 'Unknown artist'}</span>
+        </div>
+        <button data-feed="${item.feed_url}">Episodes</button>
+      `;
+      fragment.appendChild(div);
+    });
+    elements.podcastResults.appendChild(fragment);
+    setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    setListEmpty(elements.podcastResults, 'Unable to load podcasts right now.');
+  }
+}
+
+async function loadEpisodes(feedUrl, sourceItem = null) {
+  setStatus('Loading episodes...', 'info');
+  if (sourceItem) {
+    elements.podcastResults.querySelectorAll('.list-item').forEach((item) => item.classList.remove('selected'));
+    sourceItem.classList.add('selected');
+  }
+  setListEmpty(elements.episodeResults, 'Loading episodes...');
+  try {
+    const res = await fetch(`/api/podcast/episodes?feed_url=${encodeURIComponent(feedUrl)}`);
+    if (!res.ok) throw new Error('Failed to load episodes');
+    const episodes = await res.json();
+    if (!episodes.length) {
+      setListEmpty(elements.episodeResults, 'No episodes available for this podcast.');
+      return;
+    }
+    elements.episodeResults.innerHTML = '';
+    const fragment = document.createDocumentFragment();
+    episodes.slice(0, 15).forEach((ep) => {
+      const div = document.createElement('div');
+      div.className = 'list-item';
+      div.innerHTML = `
+        <div>
+          <strong>${ep.title}</strong><br/>
+          <span>${ep.published || ''}</span>
+        </div>
+        <button data-url="${ep.audio_url}" data-title="${ep.title}">Download</button>
+      `;
+      fragment.appendChild(div);
+    });
+    elements.episodeResults.appendChild(fragment);
+    setStatus('Episodes ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    setListEmpty(elements.episodeResults, 'Unable to load episodes right now.');
+  }
+}
+
+async function downloadEpisode(audioUrl, title, triggerButton = null) {
+  setStatus('Downloading episode...', 'info');
+  let originalLabel = null;
+  if (triggerButton) {
+    originalLabel = triggerButton.innerHTML;
+    triggerButton.disabled = true;
+    triggerButton.classList.add('loading');
+    triggerButton.textContent = 'Downloading…';
+  }
+  try {
+    const res = await fetch('/api/podcast/download', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ audioUrl, title }),
+    });
+    if (!res.ok) throw new Error('Episode download failed');
+    const data = await res.json();
+    state.audioUrl = data.audioUrl;
+    state.uploadedFile = null;
+    elements.audioPlayer.src = data.audioUrl;
+    setStatus('Episode ready', 'success');
+    if (triggerButton) {
+      triggerButton.textContent = 'Ready ✓';
+      triggerButton.classList.add('success');
+    }
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    if (triggerButton) {
+      triggerButton.textContent = 'Retry';
+      triggerButton.classList.add('error');
+    }
+  } finally {
+    if (triggerButton) {
+      triggerButton.disabled = false;
+      triggerButton.classList.remove('loading');
+      setTimeout(() => {
+        triggerButton.classList.remove('success', 'error');
+        triggerButton.textContent = originalLabel || 'Download';
+      }, 2000);
+    }
+  }
+}
+
+function initPodcastInteractions() {
+  elements.podcastResults.addEventListener('click', (event) => {
+    const btn = event.target.closest('button[data-feed]');
+    if (!btn) return;
+    const listItem = btn.closest('.list-item');
+    loadEpisodes(btn.dataset.feed, listItem);
+  });
+
+  elements.episodeResults.addEventListener('click', (event) => {
+    const btn = event.target.closest('button[data-url]');
+    if (!btn) return;
+    downloadEpisode(btn.dataset.url, btn.dataset.title, btn);
+  });
+}
+
+function initEventBindings() {
+  elements.transcribeBtn.addEventListener('click', handleTranscription);
+  elements.summaryBtn.addEventListener('click', handleSummaryGeneration);
+  elements.exportTranscriptBtn.addEventListener('click', handleExportTranscript);
+  elements.exportSummaryBtn.addEventListener('click', handleExportSummary);
+  elements.fileInput.addEventListener('change', handleFileUpload);
+  elements.youtubeFetch.addEventListener('click', handleYoutubeFetch);
+  elements.podcastSearch.addEventListener('click', handlePodcastSearch);
+  elements.podcastQuery.addEventListener('keydown', (event) => {
+    if (event.key === 'Enter') {
+      event.preventDefault();
+      handlePodcastSearch();
+    }
+  });
+}
+
+async function init() {
+  initTabs();
+  initSidebarInteractions();
+  initAudioInteractions();
+  initEventBindings();
+  initPodcastInteractions();
+
+  elements.backendSelect.innerHTML = `
+    <option value="moonshine">Moonshine</option>
+    <option value="sensevoice" selected>SenseVoice</option>
+  `;
+  state.backend = elements.backendSelect.value;
+
+  setListEmpty(elements.podcastResults, 'Search to discover podcasts.');
+  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+
+  await fetchConfig();
+  setStatus('Ready');
+}
+
+init();
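handleTranscription and handleSummaryGeneration both consume the response body as newline-delimited JSON: chunks are buffered, split on '\n', and each complete line is parsed and dispatched on its type field ('ready', 'utterance', 'complete', 'error' for transcription; 'partial' for summaries). For reference, a server-side sketch of a compatible producer built on FastAPI's StreamingResponse; the real /api/transcribe handler is not shown in this commit, so the payload values are placeholders inferred from the client code:

# Hypothetical NDJSON producer matching the events app.js dispatches on.
import json

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/api/transcribe")
async def transcribe():
    def events():
        # Exactly one JSON document per line; the client splits on newlines.
        yield json.dumps({"type": "ready", "audioUrl": "/static/audio/example.wav"}) + "\n"
        yield json.dumps({
            "type": "utterance",
            "utterance": {"start": 0.0, "end": 2.4, "text": "placeholder"},
        }) + "\n"
        yield json.dumps({"type": "complete", "utterances": []}) + "\n"
    return StreamingResponse(events(), media_type="application/x-ndjson")

Emitting one JSON object per line is what lets the client render utterances incrementally while the model is still running, instead of waiting for the whole transcript.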
frontend/index.html ADDED
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>VoxSum Studio</title>
+  <link rel="stylesheet" href="/styles.css" />
+</head>
+<body>
+  <header class="app-header">
+    <h1>VoxSum Studio</h1>
+    <p class="tagline">Transform Audio into Insightful Summaries</p>
+  </header>
+  <div class="app-shell">
+    <aside class="sidebar">
+      <section class="panel">
+        <h2>ASR Settings</h2>
+        <label for="backend-select">Backend</label>
+        <select id="backend-select"></select>
+
+        <label for="model-select">Model</label>
+        <select id="model-select"></select>
+
+        <div id="sensevoice-options" class="conditional hidden">
+          <label for="sensevoice-language">Language</label>
+          <select id="sensevoice-language">
+            <option value="auto">Auto</option>
+            <option value="zh">Chinese</option>
+            <option value="en">English</option>
+            <option value="ja">Japanese</option>
+            <option value="ko">Korean</option>
+            <option value="yue">Cantonese</option>
+          </select>
+
+          <label>Text Normalization</label>
+          <div class="radio-group">
+            <label><input type="radio" name="textnorm" value="withitn" checked /> With ITN</label>
+            <label><input type="radio" name="textnorm" value="noitn" /> Raw</label>
+          </div>
+        </div>
+
+        <label for="vad-threshold">VAD Threshold</label>
+        <input id="vad-threshold" type="range" min="0.1" max="0.9" step="0.05" value="0.5" />
+        <span id="vad-value" class="hint">0.50</span>
+      </section>
+
+      <section class="panel">
+        <h2>Diarization</h2>
+        <label class="toggle">
+          <input id="diarization-toggle" type="checkbox" /> Enable speaker diarization
+        </label>
+        <div id="diarization-settings" class="conditional hidden">
+          <label for="num-speakers">Number of speakers (-1 = auto)</label>
+          <input id="num-speakers" type="number" min="-1" max="10" value="-1" />
+
+          <label for="cluster-threshold">Cluster threshold</label>
+          <input id="cluster-threshold" type="range" min="0.1" max="1" step="0.05" value="0.5" />
+          <span id="cluster-value" class="hint">0.50</span>
+        </div>
+      </section>
+
+      <section class="panel">
+        <h2>Summarization</h2>
+        <label for="llm-select">LLM Model</label>
+        <select id="llm-select"></select>
+
+        <label for="prompt-input">Custom Prompt</label>
+        <textarea id="prompt-input" rows="4">Summarize the transcript below.</textarea>
+      </section>
+    </aside>
+
+    <main class="content">
+      <nav class="tabs">
+        <button class="tab active" data-target="podcast-tab">🎙️ Podcast</button>
+        <button class="tab" data-target="audio-tab">🎵 Audio Input</button>
+        <button class="tab" data-target="results-tab">📄 Results</button>
+      </nav>
+
+      <section id="podcast-tab" class="tab-panel active">
+        <div class="panel">
+          <h2>Search Podcasts</h2>
+          <div class="form-row">
+            <input id="podcast-query" type="text" placeholder="Podcast title" />
+            <button id="podcast-search">Search</button>
+          </div>
+          <div class="list-grid">
+            <section class="list-section">
+              <header class="list-section-header">
+                <h3>Podcast Channels</h3>
+                <p class="list-hint">Pick a show to reveal recent episodes.</p>
+              </header>
+              <div id="podcast-results" class="list"></div>
+            </section>
+            <section class="list-section">
+              <header class="list-section-header">
+                <h3>Episodes</h3>
+                <p class="list-hint">Episodes for the selected podcast appear here.</p>
+              </header>
+              <div id="episode-results" class="list"></div>
+            </section>
+          </div>
+        </div>
+      </section>
+
+      <section id="audio-tab" class="tab-panel">
+        <div class="panel">
+          <h2>YouTube</h2>
+          <div class="form-row">
+            <input id="youtube-url" type="url" placeholder="https://youtube.com/..." />
+            <button id="youtube-fetch">Fetch Audio</button>
+          </div>
+        </div>
+        <div class="panel">
+          <h2>Upload Audio</h2>
+          <input id="file-input" type="file" accept="audio/*" />
+        </div>
+      </section>
+
+      <section id="results-tab" class="tab-panel">
+        <div class="actions">
+          <button id="transcribe-btn" class="primary">Transcribe Audio</button>
+          <button id="summary-btn" class="secondary">Generate Summary</button>
+          <span id="status-text" class="status-text">Ready</span>
+        </div>
+
+        <section class="panel">
+          <h2>Audio Player</h2>
+          <audio id="audio-player" controls preload="auto"></audio>
+        </section>
+
+        <section class="panel">
+          <div class="panel-header">
+            <h2>Transcript</h2>
+            <span id="utterance-count" class="hint"></span>
+          </div>
+          <div id="transcript-container">
+            <ul id="transcript-list"></ul>
+          </div>
+        </section>
+
+        <section id="diarization-summary" class="panel hidden">
+          <h2>Speaker Analysis</h2>
+          <div id="diarization-metrics"></div>
+          <div id="speaker-breakdown"></div>
+        </section>
+
+        <section class="panel">
+          <h2>Summary</h2>
+          <div id="summary-output" class="summary"></div>
+        </section>
+
+        <section class="panel">
+          <h2>Export</h2>
+          <div class="export-grid">
+            <div>
+              <label for="transcript-format">Transcript format</label>
+              <select id="transcript-format"></select>
+            </div>
+            <div>
+              <label class="toggle">
+                <input id="include-timestamps" type="checkbox" checked /> Include timestamps
+              </label>
+            </div>
+            <button id="export-transcript">Export Transcript</button>
+            <div>
+              <label for="summary-format">Summary format</label>
+              <select id="summary-format"></select>
+            </div>
+            <button id="export-summary">Export Summary</button>
+          </div>
+        </section>
+      </section>
+    </main>
+  </div>
+
+  <template id="utterance-template">
+    <li class="utterance-item">
+      <div class="utterance-header">
+        <span class="timestamp"></span>
+        <span class="speaker-tag hidden"></span>
+        <div class="utterance-actions">
+          <button class="edit-btn" title="Edit">✏️</button>
+        </div>
+      </div>
+      <div class="utterance-text"></div>
+      <div class="edit-area hidden">
+        <textarea rows="3"></textarea>
+        <div class="edit-controls">
+          <button class="save-edit">Save</button>
+          <button class="cancel-edit">Cancel</button>
+        </div>
+      </div>
+    </li>
+  </template>
+
+  <script src="/app.js" type="module"></script>
+</body>
+</html>
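The page requests /styles.css and /app.js from the site root, so the backend has to serve the frontend/ directory (which the Dockerfile now copies into the image) at /. One plausible wiring uses Starlette's StaticFiles with html=True so that GET / falls back to index.html; how the actual server mounts it is not visible in this diff:

# Hypothetical static mount; API routes must be registered before this catch-all.
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

app = FastAPI()
# ... /api/* routes registered here ...
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")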
frontend/styles.css ADDED
@@ -0,0 +1,448 @@
+* {
+  box-sizing: border-box;
+}
+
+body {
+  margin: 0;
+  font-family: 'Inter', 'Segoe UI', sans-serif;
+  background: linear-gradient(180deg, #0f172a 0%, #111827 100%);
+  color: #e5e7eb;
+  min-height: 100vh;
+}
+
+.app-header {
+  padding: 2rem 3rem 1.5rem;
+  background: rgba(15, 23, 42, 0.8);
+  backdrop-filter: blur(10px);
+  border-bottom: 1px solid rgba(148, 163, 184, 0.2);
+}
+
+.app-header h1 {
+  margin: 0;
+  font-size: 2.5rem;
+  letter-spacing: 0.05em;
+}
+
+.app-header .tagline {
+  margin: 0.5rem 0 0;
+  color: #94a3b8;
+}
+
+.app-shell {
+  display: grid;
+  grid-template-columns: 320px 1fr;
+  gap: 1.5rem;
+  padding: 1.5rem 2rem 3rem;
+}
+
+.sidebar {
+  display: flex;
+  flex-direction: column;
+  gap: 1.5rem;
+}
+
+.panel {
+  background: rgba(30, 41, 59, 0.7);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  border-radius: 16px;
+  padding: 1.25rem;
+  box-shadow: 0 20px 45px rgba(15, 23, 42, 0.35);
+}
+
+.panel h2 {
+  margin: 0 0 1rem;
+  font-size: 1.1rem;
+  letter-spacing: 0.02em;
+}
+
+.panel-header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  margin-bottom: 0.5rem;
+}
+
+label {
+  display: block;
+  font-size: 0.9rem;
+  margin-bottom: 0.35rem;
+  color: #cbd5f5;
+}
+
+input[type="text"],
+input[type="url"],
+input[type="number"],
+select,
+textarea {
+  width: 100%;
+  padding: 0.6rem 0.75rem;
+  border-radius: 10px;
+  border: 1px solid rgba(148, 163, 184, 0.2);
+  background: rgba(15, 23, 42, 0.6);
+  color: #e5e7eb;
+  font: inherit;
+  transition: border-color 0.2s ease, box-shadow 0.2s ease;
+}
+
+input:focus,
+select:focus,
+textarea:focus {
+  outline: none;
+  border-color: #38bdf8;
+  box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
+}
+
+textarea {
+  resize: vertical;
+}
+
+input[type="range"] {
+  width: 100%;
+  margin: 0.5rem 0;
+}
+
+.hint {
+  font-size: 0.8rem;
+  color: #94a3b8;
+}
+
+.toggle {
+  display: flex;
+  align-items: center;
+  gap: 0.6rem;
+  font-size: 0.9rem;
+}
+
+.radio-group {
+  display: flex;
+  gap: 0.75rem;
+  margin-bottom: 0.5rem;
+}
+
+.radio-group input {
+  margin-right: 0.35rem;
+}
+
+.content {
+  display: flex;
+  flex-direction: column;
+  gap: 1.5rem;
+}
+
+.tabs {
+  display: inline-flex;
+  background: rgba(30, 41, 59, 0.6);
+  border-radius: 999px;
+  padding: 0.4rem;
+  width: fit-content;
+  border: 1px solid rgba(148, 163, 184, 0.2);
+}
+
+.tab {
+  border: none;
+  background: transparent;
+  color: #94a3b8;
+  padding: 0.6rem 1.2rem;
+  border-radius: 999px;
+  font: inherit;
+  cursor: pointer;
+  transition: all 0.2s ease;
+}
+
+.tab.active {
+  background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
+  color: #0f172a;
+  font-weight: 600;
+}
+
+.tab-panel {
+  display: none;
+}
+
+.tab-panel.active {
+  display: block;
+}
+
+.form-row {
+  display: flex;
+  gap: 0.75rem;
+}
+
+.form-row input {
+  flex: 1;
+}
+
+.list {
+  margin-top: 1rem;
+  display: grid;
+  gap: 0.75rem;
+}
+
+.list-grid {
+  margin-top: 1.5rem;
+  display: grid;
+  gap: 1.25rem;
+  grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
+  align-items: start;
+}
+
+.list-section {
+  background: rgba(15, 23, 42, 0.4);
+  border: 1px solid rgba(148, 163, 184, 0.18);
+  border-radius: 16px;
+  padding: 1rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.75rem;
+}
+
+.list-section-header {
+  display: flex;
+  flex-direction: column;
+  gap: 0.35rem;
+}
+
+.list-section-header h3 {
+  margin: 0;
+  font-size: 1rem;
+  letter-spacing: 0.02em;
+}
+
+.list-hint {
+  margin: 0;
+  font-size: 0.85rem;
+  color: #9ca3af;
+}
+
+.list-item {
+  padding: 0.75rem;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.55);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  gap: 1rem;
+}
+
+.list-item button {
+  flex-shrink: 0;
+  display: inline-flex;
+  align-items: center;
+  gap: 0.45rem;
+  transition: all 0.2s ease;
+}
+
+.list-item button.loading {
+  pointer-events: none;
+  opacity: 0.75;
+}
+
+.list-item button.loading::before {
+  content: '';
+  width: 0.9rem;
+  height: 0.9rem;
+  border-radius: 50%;
+  border: 2px solid rgba(148, 163, 184, 0.4);
+  border-top-color: #38bdf8;
+  animation: spin 0.8s linear infinite;
+}
+
+.list-item button.success {
+  background: rgba(34, 197, 94, 0.18);
+  border-color: rgba(34, 197, 94, 0.35);
+  color: #86efac;
+}
+
+.list-item button.error {
+  background: rgba(248, 113, 113, 0.18);
+  border-color: rgba(248, 113, 113, 0.35);
+  color: #fca5a5;
+}
+
+@keyframes spin {
+  to {
+    transform: rotate(360deg);
+  }
+}
+
+.list-item.selected {
+  border-color: rgba(56, 189, 248, 0.6);
+  box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
+}
+
+.empty-state {
+  padding: 1rem;
+  text-align: center;
+  border: 1px dashed rgba(148, 163, 184, 0.25);
+  border-radius: 12px;
+  color: #94a3b8;
+  font-size: 0.9rem;
+  background: rgba(15, 23, 42, 0.35);
+}
+
+.actions {
+  display: flex;
+  gap: 1rem;
+  align-items: center;
+}
+
+button {
+  border: none;
+  border-radius: 10px;
+  padding: 0.65rem 1.1rem;
+  font: inherit;
+  cursor: pointer;
+  color: #0f172a;
+  background: #e2e8f0;
+  transition: transform 0.2s ease, box-shadow 0.2s ease;
+}
+
+button.primary {
+  background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
+  color: #0f172a;
+  font-weight: 600;
+}
+
+button.secondary {
+  background: rgba(148, 163, 184, 0.2);
+  color: #e5e7eb;
+}
+
+button:hover {
+  transform: translateY(-1px);
+  box-shadow: 0 10px 25px rgba(15, 23, 42, 0.25);
+}
+
+.status-text {
+  color: #eab308;
+  font-size: 0.9rem;
+}
+
+#transcript-container {
+  max-height: 420px;
+  overflow: auto;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.4);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+}
+
+#transcript-list {
+  list-style: none;
+  padding: 0;
+  margin: 0;
+}
+
+.utterance-item {
+  padding: 0.85rem 1rem;
+  border-bottom: 1px solid rgba(148, 163, 184, 0.1);
+  transition: background 0.2s ease;
+}
+
+.utterance-item:last-child {
+  border-bottom: none;
+}
+
+.utterance-item.active {
+  background: rgba(56, 189, 248, 0.15);
+  border-left: 3px solid #38bdf8;
+}
+
+.utterance-header {
+  display: flex;
+  align-items: center;
+  gap: 0.75rem;
+}
+
+.timestamp {
+  font-size: 0.8rem;
+  color: #94a3b8;
+  min-width: 70px;
+}
+
+.speaker-tag {
+  font-size: 0.75rem;
+  padding: 0.1rem 0.5rem;
+  border-radius: 999px;
+  background: rgba(129, 140, 248, 0.2);
+}
+
+.utterance-actions {
+  margin-left: auto;
+  display: flex;
+  gap: 0.5rem;
+}
+
+.edit-btn {
+  background: rgba(148, 163, 184, 0.2);
+  color: #e5e7eb;
+  padding: 0.3rem 0.6rem;
+  font-size: 0.85rem;
+}
+
+.utterance-text {
+  margin-top: 0.4rem;
+  line-height: 1.5;
+}
+
+.edit-area {
+  margin-top: 0.6rem;
+  display: grid;
+  gap: 0.5rem;
+}
+
+.edit-area textarea {
+  width: 100%;
+}
+
+.edit-controls {
+  display: flex;
+  gap: 0.5rem;
+}
+
+.summary {
+  min-height: 120px;
+  background: rgba(15, 23, 42, 0.5);
+  border-radius: 12px;
+  padding: 1rem;
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  white-space: pre-wrap;
+}
+
+.export-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+  gap: 0.75rem;
+  align-items: end;
+}
+
+#diarization-metrics,
+#speaker-breakdown {
+  display: grid;
+  gap: 0.75rem;
+}
+
+.metric-card {
+  padding: 0.75rem;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.5);
+  border: 1px solid rgba(148, 163, 184, 0.1);
+}
+
+.hidden {
+  display: none !important;
+}
+
+@media (max-width: 1100px) {
+  .app-shell {
+    grid-template-columns: 1fr;
+  }
+
+  .sidebar {
+    order: 2;
+  }
+
+  .content {
+    order: 1;
+  }
+}
requirements.txt CHANGED
@@ -1,7 +1,4 @@
1
  --extra-index-url https://download.pytorch.org/whl/cpu
2
- altair
3
- pandas
4
- streamlit
5
  numpy<2.0
6
  soundfile
7
  onnxruntime
@@ -16,4 +13,9 @@ ffmpeg-python
16
  feedparser
17
  sherpa_onnx
18
  huggingface_hub
19
- faiss-cpu
 
 
 
 
 
 
1
  --extra-index-url https://download.pytorch.org/whl/cpu
 
 
 
2
  numpy<2.0
3
  soundfile
4
  onnxruntime
 
13
  feedparser
14
  sherpa_onnx
15
  huggingface_hub
16
+ faiss-cpu
17
+ fastapi
18
+ uvicorn[standard]
19
+ python-multipart
20
+ jinja2
21
+ aiofiles
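The dependency swap mirrors the rewrite: the Streamlit stack (altair, pandas, streamlit) goes away, and fastapi, uvicorn[standard], python-multipart, jinja2, and aiofiles come in. python-multipart is the easy one to forget: FastAPI refuses to register routes with File or Form parameters unless it is installed, and the new frontend posts its audio as multipart FormData. A sketch of the kind of endpoint that needs it (the real handler is not shown in this commit):

# Hypothetical multipart endpoint; File/Form parameters require python-multipart.
from fastapi import FastAPI, File, Form, UploadFile

app = FastAPI()

@app.post("/api/transcribe")
async def transcribe(audio: UploadFile | None = File(None), options: str = Form("{}")):
    payload = await audio.read() if audio else b""
    return {"received_bytes": len(payload)}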
src/asr.py CHANGED
@@ -1,11 +1,12 @@
 # asr.py
+import os
+import re
+import tempfile
+from typing import Iterable, List, Optional, Tuple
+
 import numpy as np
 import soundfile as sf
 from scipy.signal import resample_poly
-import re
-from typing import Optional, Tuple, List
-import tempfile
-import os
 
 # Lazy / optional imports: guard heavy or optional ASR backends
 try:
@@ -20,11 +21,7 @@ except Exception:
     MoonshineOnnxModel = None
     load_tokenizer = None
 
-from utils import s2tw_converter, load_sensevoice_model
-import re
-from typing import Optional, Tuple, List
-import tempfile
-import os
+from .utils import load_sensevoice_model, s2tw_converter
 
 SAMPLING_RATE = 16000
 CHUNK_SIZE = 512
@@ -44,8 +41,8 @@ def transcribe_file(
     model_name: str,
     backend: str = "moonshine",
     language: str = "auto",
-    textnorm: str = "withitn"
-) -> Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]:
+    textnorm: str = "withitn",
+) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]]:
     """
     Transcribe audio file using specified backend.
 
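The signature change is the substantive part of this diff: transcribe_file now returns an Iterable of (latest utterance, utterances so far) tuples instead of a single tuple, which suggests it became a generator yielding partial results as segments are decoded, the shape a streaming /api/transcribe endpoint needs. A hedged consumer sketch; the positional audio argument and the model name fall outside the visible hunks and are assumptions:

# Hypothetical consumer of the generator-style transcribe_file.
from src.asr import transcribe_file

for latest, utterances_so_far in transcribe_file(
    "episode.wav",              # assumed positional audio argument
    model_name="sensevoice",    # placeholder model name
    backend="sensevoice",
    language="auto",
    textnorm="withitn",
):
    if latest is not None:
        start, end, text = latest
        print(f"[{start:6.1f}-{end:6.1f}] {text}")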
src/diarization.py CHANGED
@@ -17,12 +17,11 @@ import numpy as np
17
  import sherpa_onnx
18
  from pathlib import Path
19
  from typing import List, Tuple, Optional, Callable, Dict, Any
20
- import streamlit as st
21
  import logging
22
- from utils import get_writable_model_dir
23
- from utils import num_vcpus
24
  from huggingface_hub import hf_hub_download
25
  import shutil
 
26
 
27
  # Import the improved diarization pipeline (robust: search repo tree)
28
  try:
@@ -95,11 +94,11 @@ def download_diarization_models():
95
  repo_id = "csukuangfj/speaker-embedding-models"
96
  filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
97
  embedding_model = models_dir / filename
98
- st.info(f"Model cache directory: {models_dir}")
99
  try:
100
  # Download using huggingface_hub if not present
101
  if not embedding_model.exists():
102
- st.info("📥 Downloading eres2netv2 Chinese speaker model from HuggingFace (29MB)...")
103
  downloaded_path = hf_hub_download(
104
  repo_id=repo_id,
105
  filename=filename,
@@ -111,10 +110,10 @@ def download_diarization_models():
111
  # Move/copy to expected location if needed
112
  if Path(downloaded_path) != embedding_model:
113
  shutil.copy(downloaded_path, embedding_model)
114
- st.success("✅ eres2netv2 Chinese embedding model downloaded!")
115
  return str(embedding_model), True
116
  except Exception as e:
117
- st.error(f"❌ Failed to download diarization models: {e}")
118
  return None, False
119
 
120
  def init_speaker_embedding_extractor(
@@ -137,26 +136,26 @@ def init_speaker_embedding_extractor(
137
  embedding_model, success = download_diarization_models()
138
  if not success:
139
  return None
140
-
141
  # Create embedding extractor config
142
  embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
143
  model=embedding_model,
144
  num_threads=num_vcpus
145
  )
146
-
147
  # Initialize embedding extractor
148
  embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
149
-
150
  # Store clustering parameters separately
151
  config_dict = {
152
  'cluster_threshold': cluster_threshold,
153
  'num_speakers': num_speakers
154
  }
155
-
156
  return embedding_extractor, config_dict
157
-
158
  except Exception as e:
159
- st.error(f"❌ Failed to initialize speaker embedding extractor: {e}")
160
  return None
161
 
162
  def perform_speaker_diarization_on_utterances(
@@ -195,19 +194,13 @@ def perform_speaker_diarization_on_utterances(
195
  # Check sample rate
196
  if sample_rate != 16000:
197
  warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
198
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
199
- st.warning(warning_msg)
200
- print(warning_msg)
201
 
202
  if not utterances:
203
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
204
- st.warning("⚠️ No utterances provided for diarization")
205
- print("⚠️ No utterances provided for diarization")
206
  return []
207
 
208
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
209
- st.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
210
- print(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
211
 
212
  # Extract embeddings for each utterance segment
213
  embeddings = []
@@ -258,12 +251,12 @@ def perform_speaker_diarization_on_utterances(
258
  continue
259
 
260
  if not embeddings:
261
- st.error("❌ No valid embeddings extracted")
262
  print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
263
  return []
264
 
265
  print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
266
- st.info(f"✅ Extracted {len(embeddings)} embeddings, performing clustering...")
267
 
268
  # Convert embeddings to numpy array
269
  embeddings_array = np.array(embeddings)
@@ -272,7 +265,7 @@ def perform_speaker_diarization_on_utterances(
272
  # Use enhanced diarization if available
273
  if ENHANCED_DIARIZATION_AVAILABLE:
274
  print("🚀 Using enhanced diarization with adaptive clustering...")
275
- st.info("🚀 Using enhanced adaptive clustering...")
276
 
277
  # Prepare utterances dict format for enhanced pipeline
278
  utterances_dict = []
@@ -300,11 +293,11 @@ def perform_speaker_diarization_on_utterances(
300
 
301
  quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
302
  if quality in ['excellent', 'good']:
303
- st.success(quality_msg)
304
  elif quality == 'fair':
305
- st.warning(quality_msg)
306
  else:
307
- st.error(quality_msg)
308
 
309
  print(f"✅ Enhanced diarization quality report:")
310
  print(f" - Quality: {quality}")
@@ -314,7 +307,7 @@ def perform_speaker_diarization_on_utterances(
314
  print(f" - Speakers detected: {n_speakers}")
315
 
316
  if quality_report['recommendations']:
317
- st.info("💡 " + "; ".join(quality_report['recommendations']))
318
 
319
  # Convert back to tuple format
320
  diarization_result = []
@@ -325,17 +318,17 @@ def perform_speaker_diarization_on_utterances(
325
  progress_callback(1.0) # 100% complete
326
 
327
  print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
328
- st.success(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
329
 
330
  return diarization_result
331
 
332
  except Exception as e:
333
- st.error(f"❌ Enhanced diarization failed: {e}")
334
  print(f"❌ Enhanced diarization failed: {e}")
335
  # Fall back to original clustering
336
 
337
  # Fallback to original clustering
338
- st.warning("⚠️ Using fallback clustering")
339
  print("⚠️ Using fallback clustering")
340
 
341
  # >>> NEW: FAISS clustering when available, otherwise the legacy code path
@@ -349,8 +342,6 @@ def perform_speaker_diarization_on_utterances(
349
  print(error_msg)
350
  import traceback
351
  traceback.print_exc()
352
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
353
- st.error(error_msg)
354
  return []
355
 
356
  def merge_transcription_with_diarization(
@@ -555,7 +546,7 @@ def faiss_clustering(embeddings: np.ndarray,
555
 
556
  num_speakers = len(set(labels))
557
  print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
558
- st.success(f"🎭 FAISS clustering completed! Detected {num_speakers} speakers")
559
 
560
  return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
561
 
 
17
  import sherpa_onnx
18
  from pathlib import Path
19
  from typing import List, Tuple, Optional, Callable, Dict, Any
 
20
  import logging
21
+ from .utils import get_writable_model_dir, num_vcpus
 
22
  from huggingface_hub import hf_hub_download
23
  import shutil
24
+ from sklearn.metrics import silhouette_score
25
 
26
  # Import the improved diarization pipeline (robust: search repo tree)
27
  try:
 
94
  repo_id = "csukuangfj/speaker-embedding-models"
95
  filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
96
  embedding_model = models_dir / filename
97
+ logger.info(f"Model cache directory: {models_dir}")
98
  try:
99
  # Download using huggingface_hub if not present
100
  if not embedding_model.exists():
101
+ logger.info("📥 Downloading eres2netv2 Chinese speaker model from HuggingFace (29MB)...")
102
  downloaded_path = hf_hub_download(
103
  repo_id=repo_id,
104
  filename=filename,
 
110
  # Move/copy to expected location if needed
111
  if Path(downloaded_path) != embedding_model:
112
  shutil.copy(downloaded_path, embedding_model)
113
+ logger.info("✅ eres2netv2 Chinese embedding model downloaded!")
114
  return str(embedding_model), True
115
  except Exception as e:
116
+ logger.error(f"❌ Failed to download diarization models: {e}")
117
  return None, False
118
 
119
  def init_speaker_embedding_extractor(
 
136
  embedding_model, success = download_diarization_models()
137
  if not success:
138
  return None
139
+
140
  # Create embedding extractor config
141
  embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
142
  model=embedding_model,
143
  num_threads=num_vcpus
144
  )
145
+
146
  # Initialize embedding extractor
147
  embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
148
+
149
  # Store clustering parameters separately
150
  config_dict = {
151
  'cluster_threshold': cluster_threshold,
152
  'num_speakers': num_speakers
153
  }
154
+
155
  return embedding_extractor, config_dict
156
+
157
  except Exception as e:
158
+ logger.error(f"❌ Failed to initialize speaker embedding extractor: {e}")
159
  return None
160
 
161
  def perform_speaker_diarization_on_utterances(
 
194
  # Check sample rate
195
  if sample_rate != 16000:
196
  warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
197
+ logger.warning(warning_msg)
 
 
198
 
199
  if not utterances:
200
+ logger.warning("⚠️ No utterances provided for diarization")
 
 
201
  return []
202
 
203
+ logger.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
 
 
204
 
205
  # Extract embeddings for each utterance segment
206
  embeddings = []
 
251
  continue
252
 
253
  if not embeddings:
254
+ logger.error("❌ No valid embeddings extracted")
255
  print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
256
  return []
257
 
258
  print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
259
+ logger.info(f"✅ Extracted {len(embeddings)} embeddings, performing clustering...")
260
 
261
  # Convert embeddings to numpy array
262
  embeddings_array = np.array(embeddings)
 
265
  # Use enhanced diarization if available
266
  if ENHANCED_DIARIZATION_AVAILABLE:
267
  print("🚀 Using enhanced diarization with adaptive clustering...")
268
+ logger.info("🚀 Using enhanced adaptive clustering...")
269
 
270
  # Prepare utterances dict format for enhanced pipeline
271
  utterances_dict = []
 
293
 
294
  quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
295
  if quality in ['excellent', 'good']:
296
+ logger.info(quality_msg)
297
  elif quality == 'fair':
298
+ logger.warning(quality_msg)
299
  else:
300
+ logger.error(quality_msg)
301
 
302
  print(f"✅ Enhanced diarization quality report:")
303
  print(f" - Quality: {quality}")
 
307
  print(f" - Speakers detected: {n_speakers}")
308
 
309
  if quality_report['recommendations']:
310
+ logger.info("💡 " + "; ".join(quality_report['recommendations']))
311
 
312
  # Convert back to tuple format
313
  diarization_result = []
 
318
  progress_callback(1.0) # 100% complete
319
 
320
  print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
321
+ logger.info(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
322
 
323
  return diarization_result
324
 
325
  except Exception as e:
326
+ logger.error(f"❌ Enhanced diarization failed: {e}")
327
  print(f"❌ Enhanced diarization failed: {e}")
328
  # Fall back to original clustering
329
 
330
  # Fallback to original clustering
331
+ logger.warning("⚠️ Using fallback clustering")
332
  print("⚠️ Using fallback clustering")
333
 
334
  # >>> NEW: FAISS clustering when available, otherwise the legacy code path
 
342
  print(error_msg)
343
  import traceback
344
  traceback.print_exc()
 
 
345
  return []
346
 
347
  def merge_transcription_with_diarization(
 
546
 
547
  num_speakers = len(set(labels))
548
  print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
549
+ logger.info(f"🎭 FAISS clustering completed! Detected {num_speakers} speakers")
550
 
551
  return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
552
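Note on the hunks above: they log through a module-level `logger` whose creation sits outside the visible context. A minimal sketch of the setup this code assumes (the handler configuration is illustrative, not part of the commit):

import logging

# Module-level logger assumed by the diarization hunks above.
logger = logging.getLogger(__name__)

# Illustrative default so messages show up when the module runs standalone;
# the real configuration may live in the FastAPI entry point.
if not logging.getLogger().handlers:
    logging.basicConfig(level=logging.INFO)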
 
src/server/__init__.py ADDED
File without changes
src/server/core/config.py ADDED
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from functools import lru_cache
6
+
7
+
8
+ BASE_DIR = Path(__file__).resolve().parents[3]
9
+ STATIC_DIR = BASE_DIR / "static"
10
+ AUDIO_DIR = STATIC_DIR / "audio"
11
+ MODEL_CACHE_DIR = BASE_DIR / "tmp" / "models"
12
+ FRONTEND_DIR = BASE_DIR / "frontend"
13
+ TMP_DIR = BASE_DIR / "tmp"
14
+
15
+ # Ensure required directories exist
16
+ for directory in (STATIC_DIR, AUDIO_DIR, MODEL_CACHE_DIR, TMP_DIR, FRONTEND_DIR):
17
+ directory.mkdir(parents=True, exist_ok=True)
18
+
19
+
20
+ class Settings:
21
+ app_name: str = "VoxSum Studio API"
22
+ static_dir: Path = STATIC_DIR
23
+ audio_dir: Path = AUDIO_DIR
24
+ frontend_dir: Path = FRONTEND_DIR
25
+ tmp_dir: Path = TMP_DIR
26
+ model_cache_dir: Path = MODEL_CACHE_DIR
27
+ max_audio_files: int = int(os.environ.get("VOXSUM_MAX_AUDIO_FILES", "20"))
28
+ transcription_chunk_size: int = 100
29
+
30
+
31
+ @lru_cache(maxsize=1)
32
+ def get_settings() -> Settings:
33
+ return Settings()
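Usage sketch for the settings module (values illustrative): `get_settings()` is memoized, so every caller shares one `Settings` instance, and `max_audio_files` reads its environment variable when the class body executes, i.e. at import time, so `VOXSUM_MAX_AUDIO_FILES` must be set before the module is first imported.

import os

# Must happen before the first import of src.server.core.config.
os.environ["VOXSUM_MAX_AUDIO_FILES"] = "50"

from src.server.core.config import get_settings

settings = get_settings()           # lru_cache: one shared Settings instance
assert settings.max_audio_files == 50
print(settings.audio_dir)           # <repo root>/static/audio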
src/server/main.py ADDED
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from fastapi.staticfiles import StaticFiles
6
+
7
+ from .core.config import get_settings
8
+ from .routers.api import router as api_router
9
+
10
+ settings = get_settings()
11
+
12
+ app = FastAPI(title=settings.app_name)
13
+
14
+ app.add_middleware(
15
+ CORSMiddleware,
16
+ allow_origins=["*"],
17
+ allow_credentials=True,
18
+ allow_methods=["*"] ,
19
+ allow_headers=["*"],
20
+ )
21
+
22
+ app.include_router(api_router)
23
+
24
+ app.mount("/static", StaticFiles(directory=settings.static_dir), name="static")
25
+ app.mount("/media", StaticFiles(directory=settings.audio_dir), name="media")
26
+ app.mount("/", StaticFiles(directory=settings.frontend_dir, html=True), name="frontend")
27
+
28
+
29
+ @app.get("/health")
30
+ def healthcheck() -> dict[str, str]:
31
+ return {"status": "ok"}
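A quick smoke test of the wiring above, assuming `httpx` is installed (Starlette's TestClient depends on it) and that frontend/index.html exists; a sketch, not part of the commit:

from fastapi.testclient import TestClient

from src.server.main import app

client = TestClient(app)

# Reachable because /health is registered before the root mount.
assert client.get("/health").json() == {"status": "ok"}

# html=True makes the frontend mount serve frontend/index.html at "/".
assert client.get("/").status_code == 200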
src/server/routers/api.py ADDED
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from fastapi import APIRouter, File, Form, HTTPException, UploadFile
7
+ from fastapi.responses import StreamingResponse
8
+
9
+ from ..models.export import SummaryExportRequest, TranscriptExportRequest
10
+ from ..models.summarization import SummaryRequest
11
+ from ..models.transcription import TranscriptionRequest
12
+ from ..core.config import get_settings
13
+ from ..services import config_service, export_service, podcast_service
14
+ from ..services.asr_service import iter_transcription_events
15
+ from ..services.file_service import save_upload_file, store_audio_file
16
+ from ..services.summarization_service import iter_summary_events
17
+
18
+ router = APIRouter(prefix="/api")
19
+
20
+
21
+ @router.get("/config/models")
22
+ def fetch_model_catalog():
23
+ return config_service.get_model_catalog()
24
+
25
+
26
+ @router.post("/transcribe")
27
+ def transcribe_audio(
28
+ audio: UploadFile | None = File(default=None),
29
+ options: str = Form("{}"),
30
+ source: str | None = Form(default=None),
31
+ ):
32
+ payload = TranscriptionRequest(**json.loads(options or "{}"))
33
+
34
+ cleanup_temp = False
35
+ if audio is not None:
36
+ temp_path = save_upload_file(audio)
37
+ _, audio_url = store_audio_file(temp_path)
38
+ cleanup_temp = True
39
+ elif source:
40
+ filename = Path(source).name
41
+ candidate_path = get_settings().audio_dir / filename
42
+ if not candidate_path.exists():
43
+ raise HTTPException(status_code=404, detail="Audio source not found")
44
+ temp_path = candidate_path
45
+ audio_url = source
46
+ else:
47
+ raise HTTPException(status_code=400, detail="Either audio upload or source is required")
48
+
49
+ def event_stream():
50
+ try:
51
+ for event in iter_transcription_events(temp_path, audio_url, payload):
52
+ yield json.dumps(event, ensure_ascii=False) + "\n"
53
+ finally:
54
+ if cleanup_temp:
55
+ temp_path.unlink(missing_ok=True)
56
+
57
+ return StreamingResponse(event_stream(), media_type="application/x-ndjson")
58
+
59
+
60
+ @router.post("/summarize")
61
+ def summarize_text(request: SummaryRequest):
62
+ def event_stream():
63
+ for event in iter_summary_events(request):
64
+ yield json.dumps(event, ensure_ascii=False) + "\n"
65
+
66
+ return StreamingResponse(event_stream(), media_type="application/x-ndjson")
67
+
68
+
69
+ @router.get("/podcast/search")
70
+ def search_podcast(query: str):
71
+ return podcast_service.search_series(query)
72
+
73
+
74
+ @router.get("/podcast/episodes")
75
+ def get_podcast_episodes(feed_url: str):
76
+ return podcast_service.list_episodes(feed_url)
77
+
78
+
79
+ @router.post("/podcast/download")
80
+ def download_episode(payload: dict):
81
+ audio_url = payload.get("audioUrl") or payload.get("audio_url")
82
+ title = payload.get("title", "Episode")
83
+ if not audio_url:
84
+ raise HTTPException(status_code=400, detail="audioUrl is required")
85
+ return podcast_service.download_episode(audio_url, title)
86
+
87
+
88
+ @router.post("/youtube/fetch")
89
+ def fetch_youtube_audio(payload: dict):
90
+ url = payload.get("url") or payload.get("youtubeUrl")
91
+ if not url:
92
+ raise HTTPException(status_code=400, detail="url is required")
93
+ return podcast_service.fetch_youtube_audio(url)
94
+
95
+
96
+ @router.post("/export/transcript")
97
+ def export_transcript(payload: TranscriptExportRequest):
98
+ content, filename, mime_type = export_service.generate_transcript_export(payload)
99
+ return StreamingResponse(
100
+ iter([content.encode("utf-8")]),
101
+ media_type=mime_type,
102
+ headers={"Content-Disposition": f"attachment; filename={filename}"},
103
+ )
104
+
105
+
106
+ @router.post("/export/summary")
107
+ def export_summary(payload: SummaryExportRequest):
108
+ content, filename, mime_type = export_service.generate_summary_export(payload)
109
+ return StreamingResponse(
110
+ iter([content.encode("utf-8")]),
111
+ media_type=mime_type,
112
+ headers={"Content-Disposition": f"attachment; filename={filename}"},
113
+ )
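For reference, a hedged client sketch for the streaming transcription route; the base URL and option values are assumptions, while the event shapes follow `iter_transcription_events`. One caveat of NDJSON streaming: once the first line is sent, an HTTPException raised inside the generator can no longer change the status code the client already received.

import json
import requests

BASE = "http://localhost:7860"  # assumed deployment URL

# Option keys mirror the TranscriptionRequest fields read by the ASR service.
options = {"backend": "sensevoice", "language": "auto", "vad_threshold": 0.5}

with open("sample.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/transcribe",
        files={"audio": ("sample.wav", f, "audio/wav")},
        data={"options": json.dumps(options)},
        stream=True,
    )
resp.raise_for_status()

for line in resp.iter_lines():
    if not line:
        continue
    event = json.loads(line)
    if event["type"] == "utterance":
        u = event["utterance"]
        print(f"[{u['start']:.2f}-{u['end']:.2f}] {u['text']}")
    elif event["type"] == "complete":
        print("transcript length:", len(event["transcript"]))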
src/server/services/asr_service.py ADDED
@@ -0,0 +1,149 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Iterable, List, Optional, Tuple
5
+
6
+ import soundfile as sf
7
+ from fastapi import HTTPException
8
+
9
+ from src.asr import transcribe_file
10
+ from src.diarization import (
11
+ get_diarization_stats,
12
+ init_speaker_embedding_extractor,
13
+ merge_consecutive_utterances,
14
+ merge_transcription_with_diarization,
15
+ perform_speaker_diarization_on_utterances,
16
+ )
17
+ from src.utils import sensevoice_models
18
+
19
+ from ..core.config import get_settings
20
+ from ..models.transcription import DiarizationOptions, TranscriptionRequest
21
+
22
+ settings = get_settings()
23
+
24
+
25
+ def _serialize_utterance(utt: Tuple[float, float, str], speaker: Optional[int] = None) -> Dict[str, object]:
26
+ start, end, text = utt
27
+ payload: Dict[str, object] = {
28
+ "start": round(float(start), 3),
29
+ "end": round(float(end), 3),
30
+ "text": text,
31
+ }
32
+ if speaker is not None:
33
+ payload["speaker"] = int(speaker)
34
+ return payload
35
+
36
+
37
+ def _prepare_model_name(options: TranscriptionRequest) -> str:
38
+ if options.backend == "sensevoice":
39
+ # sensevoice_models maps friendly model names to repo ids
40
+ return sensevoice_models.get(options.model_name, options.model_name)
41
+ return options.model_name
42
+
43
+
44
+ def iter_transcription_events(
45
+ audio_path: Path,
46
+ audio_url: str,
47
+ options: TranscriptionRequest,
48
+ ) -> Iterable[Dict[str, object]]:
49
+ model_name = _prepare_model_name(options)
50
+
51
+ try:
52
+ generator = transcribe_file(
53
+ audio_path=str(audio_path),
54
+ vad_threshold=options.vad_threshold,
55
+ model_name=model_name,
56
+ backend=options.backend,
57
+ language=options.language,
58
+ textnorm=options.textnorm,
59
+ )
60
+
61
+ yield {
62
+ "type": "ready",
63
+ "audioUrl": audio_url,
64
+ "backend": options.backend,
65
+ "model": model_name,
66
+ }
67
+
68
+ final_utterances: List[Tuple[float, float, str]] = []
69
+
70
+ for current_utterance, all_utterances in generator:
71
+ if current_utterance:
72
+ start, end, text = current_utterance
73
+ yield {
74
+ "type": "utterance",
75
+ "utterance": _serialize_utterance((start, end, text)),
76
+ "index": len(all_utterances) - 1,
77
+ }
78
+ final_utterances = list(all_utterances)
79
+
80
+ # Final event with transcript and optional diarization
81
+ diarization_payload = None
82
+ if options.diarization.enable:
83
+ diarization_payload = _run_diarization(audio_path, final_utterances, options.diarization)
84
+
85
+ transcript_text = "\n".join([utt[2] for utt in final_utterances])
86
+
87
+ yield {
88
+ "type": "complete",
89
+ "utterances": [_serialize_utterance(utt) for utt in final_utterances],
90
+ "transcript": transcript_text,
91
+ "diarization": diarization_payload,
92
+ }
93
+
94
+ except Exception as exc: # pragma: no cover
95
+ raise HTTPException(status_code=500, detail=f"Transcription failed: {exc}")
96
+
97
+
98
+ def _run_diarization(
99
+ audio_path: Path,
100
+ utterances: List[Tuple[float, float, str]],
101
+ options: DiarizationOptions,
102
+ ) -> Optional[Dict[str, object]]:
103
+ if not utterances:
104
+ return None
105
+
106
+ extractor_result = init_speaker_embedding_extractor(
107
+ cluster_threshold=options.cluster_threshold,
108
+ num_speakers=options.num_speakers,
109
+ )
110
+ if not extractor_result:
111
+ return None
112
+
113
+ embedding_extractor, config_dict = extractor_result
114
+
115
+ audio, sample_rate = sf.read(str(audio_path), dtype="float32")
116
+ if audio.ndim > 1:
117
+ audio = audio.mean(axis=1)
118
+
119
+ if sample_rate != 16000:
120
+ # Lazy import so scipy is only required when resampling is needed
121
+ from scipy.signal import resample
122
+
123
+ target_num_samples = int(len(audio) * 16000 / sample_rate)
124
+ audio = resample(audio, target_num_samples)
125
+ sample_rate = 16000
126
+
127
+ diarization_segments = perform_speaker_diarization_on_utterances(
128
+ audio=audio,
129
+ sample_rate=sample_rate,
130
+ utterances=utterances,
131
+ embedding_extractor=embedding_extractor,
132
+ config_dict=config_dict,
133
+ progress_callback=None,
134
+ )
135
+
136
+ if not diarization_segments:
137
+ return None
138
+
139
+ merged = merge_transcription_with_diarization(utterances, diarization_segments)
140
+ merged = merge_consecutive_utterances(merged, max_gap=1.0)
141
+ stats = get_diarization_stats(merged)
142
+
143
+ return {
144
+ "utterances": [
145
+ _serialize_utterance((start, end, text), speaker)
146
+ for start, end, text, speaker in merged
147
+ ],
148
+ "stats": stats,
149
+ }
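One design note on the resampling step above: `scipy.signal.resample` is FFT-based and processes the whole signal at once, which gets expensive for hour-long recordings. A polyphase alternative for the same 16 kHz target is sketched below; it is an option, not what the commit ships.

from scipy.signal import resample_poly

def to_16k(audio, sample_rate):
    """Polyphase resampling to 16 kHz (sketch).

    resample_poly reduces up/down by their gcd internally, so the raw
    rates can be passed directly.
    """
    if sample_rate == 16000:
        return audio
    return resample_poly(audio, up=16000, down=sample_rate)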
src/server/services/config_service.py ADDED
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from src.utils import available_gguf_llms, model_names, sensevoice_models
6
+
7
+
8
+ def get_model_catalog() -> Dict[str, object]:
9
+ return {
10
+ "moonshine": model_names,
11
+ "sensevoice": sensevoice_models,
12
+ "llms": {name: {"repo": repo, "filename": filename} for name, (repo, filename) in available_gguf_llms.items()},
13
+ }
src/server/services/export_service.py ADDED
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Tuple
5
+
6
+ from src.export_utils import (
7
+ SUBTITLE_FORMATS,
8
+ SUMMARY_FORMATS,
9
+ TRANSCRIPT_FORMATS,
10
+ export_plain_text,
11
+ )
12
+
13
+ from ..models.export import SummaryExportRequest, TranscriptExportRequest
14
+
15
+
16
+ def _build_utterance_tuples(payload: TranscriptExportRequest):
17
+ utterances = [(u.start, u.end, u.text) for u in payload.utterances]
18
+ has_speakers = any(u.speaker is not None for u in payload.utterances)
19
+ utterances_with_speakers = None
20
+ if has_speakers:
21
+ utterances_with_speakers = [
22
+ (u.start, u.end, u.text, u.speaker if u.speaker is not None else 0)
23
+ for u in payload.utterances
24
+ ]
25
+ return utterances, utterances_with_speakers
26
+
27
+
28
+ def generate_transcript_export(payload: TranscriptExportRequest) -> Tuple[str, str, str]:
29
+ utterances, utterances_with_speakers = _build_utterance_tuples(payload)
30
+
31
+ if payload.format in SUBTITLE_FORMATS:
32
+ fmt = SUBTITLE_FORMATS[payload.format]
33
+ content = fmt["function"](utterances, utterances_with_speakers)
34
+ elif payload.format in TRANSCRIPT_FORMATS:
35
+ fmt = TRANSCRIPT_FORMATS[payload.format]
36
+ if payload.format == "Plain Text":
37
+ content = export_plain_text(
38
+ utterances,
39
+ utterances_with_speakers,
40
+ include_timestamps=payload.include_timestamps,
41
+ )
42
+ else:
43
+ content = fmt["function"](utterances, utterances_with_speakers)
44
+ else:
45
+ raise ValueError(f"Unsupported transcript export format: {payload.format}")
46
+
47
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
48
+ filename = f"transcript_{timestamp}{fmt['extension']}"
49
+ return content, filename, fmt["mime_type"]
50
+
51
+
52
+ def generate_summary_export(payload: SummaryExportRequest) -> Tuple[str, str, str]:
53
+ if payload.format not in SUMMARY_FORMATS:
54
+ raise ValueError(f"Unsupported summary export format: {payload.format}")
55
+
56
+ fmt = SUMMARY_FORMATS[payload.format]
57
+ content = fmt["function"](payload.summary, payload.metadata)
58
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
59
+ filename = f"summary_{timestamp}{fmt['extension']}"
60
+ return content, filename, fmt["mime_type"]
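A hedged sketch of calling the transcript export route from Python. The payload fields mirror what `generate_transcript_export` reads (`format`, `include_timestamps`, and `utterances` with start/end/text/speaker); "Plain Text" is the only format key visible in this diff, so other format names would be assumptions.

import requests

payload = {
    "format": "Plain Text",
    "include_timestamps": True,
    "utterances": [
        {"start": 0.0, "end": 2.5, "text": "Hello there.", "speaker": 0},
        {"start": 2.5, "end": 5.0, "text": "Hi!", "speaker": 1},
    ],
}

resp = requests.post("http://localhost:7860/api/export/transcript", json=payload)  # base URL assumed
resp.raise_for_status()
with open("transcript.txt", "wb") as f:
    f.write(resp.content)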
src/server/services/file_service.py ADDED
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import uuid
5
+ from pathlib import Path
6
+ from typing import Tuple
7
+
8
+ from fastapi import UploadFile
9
+
10
+ from ..core.config import get_settings
11
+
12
+ settings = get_settings()
13
+
14
+
15
+ def cleanup_old_audio_files(max_files: int | None = None) -> None:
16
+ """Remove old audio files from the static directory to save space."""
17
+ max_files = max_files or settings.max_audio_files
18
+ audio_dir = settings.audio_dir
19
+ audio_dir.mkdir(parents=True, exist_ok=True)
20
+
21
+ files = sorted(audio_dir.glob("*") , key=lambda f: f.stat().st_mtime if f.exists() else 0)
22
+ if len(files) <= max_files:
23
+ return
24
+
25
+ for old_file in files[:-max_files]:
26
+ try:
27
+ old_file.unlink()
28
+ except OSError:
29
+ continue
30
+
31
+
32
+ def save_upload_file(upload: UploadFile) -> Path:
33
+ """Persist an UploadFile to the temporary directory and return its path."""
34
+ tmp_dir = settings.tmp_dir
35
+ tmp_dir.mkdir(parents=True, exist_ok=True)
36
+ suffix = Path(upload.filename or "audio").suffix or ".mp3"
37
+ temp_path = tmp_dir / f"upload_{uuid.uuid4().hex}{suffix}"
38
+
39
+ with temp_path.open("wb") as buffer:
40
+ shutil.copyfileobj(upload.file, buffer)
41
+
42
+ return temp_path
43
+
44
+
45
+ def store_audio_file(audio_path: Path, prefix: str | None = None) -> Tuple[Path, str]:
46
+ """Copy an audio file to the public static folder and return the new path and URL."""
47
+ cleanup_old_audio_files()
48
+
49
+ prefix = prefix or "audio"
50
+ suffix = audio_path.suffix or ".mp3"
51
+ dest_filename = f"{prefix}_{uuid.uuid4().hex}{suffix}"
52
+ dest_path = settings.audio_dir / dest_filename
53
+
54
+ shutil.copy2(audio_path, dest_path)
55
+
56
+ url = f"/media/{dest_filename}"
57
+ return dest_path, url
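Taken together, these helpers implement the upload flow used by the transcribe route; a minimal usage sketch (paths relative to the repo root):

from pathlib import Path

from src.server.services.file_service import store_audio_file

# Copies a local file into the public audio folder; files beyond the
# configured cap are pruned first by cleanup_old_audio_files().
dest_path, url = store_audio_file(Path("sample.mp3"), prefix="demo")
print(dest_path)  # static/audio/demo_<uuid>.mp3
print(url)        # /media/demo_<uuid>.mp3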
src/server/services/podcast_service.py ADDED
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional
5
+
6
+ from fastapi import HTTPException
7
+
8
+ from src.podcast import (
9
+ download_podcast_audio,
10
+ fetch_audio,
11
+ fetch_episodes,
12
+ search_podcast_series,
13
+ )
14
+
15
+ from .file_service import store_audio_file
16
+
17
+
18
+ def search_series(query: str) -> List[Dict[str, object]]:
19
+ return search_podcast_series(query)
20
+
21
+
22
+ def list_episodes(feed_url: str) -> List[Dict[str, object]]:
23
+ return fetch_episodes(feed_url)
24
+
25
+
26
+ def download_episode(audio_url: str, title: str) -> Dict[str, str]:
27
+ file_path, status = download_podcast_audio(audio_url, title, status="Podcast download")
28
+ if not file_path:
29
+ raise HTTPException(status_code=500, detail=status or "Download failed")
30
+
31
+ _, audio_url = store_audio_file(Path(file_path), prefix="podcast")
32
+ return {"audioUrl": audio_url, "status": status}
33
+
34
+
35
+ def fetch_youtube_audio(youtube_url: str) -> Dict[str, str]:
36
+ audio_path, status = fetch_audio(youtube_url, status="YouTube fetch")
37
+ if not audio_path:
38
+ raise HTTPException(status_code=500, detail=status or "YouTube download failed")
39
+
40
+ _, audio_url = store_audio_file(Path(audio_path), prefix="youtube")
41
+ return {"audioUrl": audio_url, "status": status}
src/server/services/summarization_service.py ADDED
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, Iterable
4
+
5
+ from fastapi import HTTPException
6
+
7
+ from src.summarization import summarize_transcript
8
+
9
+ from ..models.summarization import SummaryRequest
10
+
11
+
12
+ def iter_summary_events(payload: SummaryRequest) -> Iterable[Dict[str, str]]:
13
+ try:
14
+ generator = summarize_transcript(
15
+ transcript=payload.transcript,
16
+ selected_gguf_model=payload.llm_model,
17
+ prompt_input=payload.prompt,
18
+ )
19
+
20
+ for chunk in generator:
21
+ yield {"type": "partial", "content": chunk}
22
+
23
+ yield {"type": "complete"}
24
+
25
+ except Exception as exc: # pragma: no cover
26
+ raise HTTPException(status_code=500, detail=f"Summary failed: {exc}")
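The summary route streams the same NDJSON convention as transcription; a client sketch, with field names taken from the SummaryRequest usage above (transcript, llm_model, prompt) and the model name assumed:

import json
import requests

payload = {
    "transcript": "...full transcript text...",
    "llm_model": "some-gguf-model",  # must be a key in available_gguf_llms; name assumed
    "prompt": "Summarize the key points.",
}

with requests.post("http://localhost:7860/api/summarize", json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if event["type"] == "partial":
            print(event["content"], end="", flush=True)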
src/summarization.py CHANGED
@@ -1,10 +1,10 @@
1
  # summarization.py
2
- from llama_cpp import Llama
3
- from utils import available_gguf_llms, s2tw_converter
4
  import time
5
  from functools import lru_cache
6
- import multiprocessing
7
- from utils import num_vcpus
 
 
8
  # Detect logical cores (vCPUs available to the container)
9
  print(f"Detected vCPUs: {num_vcpus}")
10
 
 
1
  # summarization.py
 
 
2
  import time
3
  from functools import lru_cache
4
+
5
+ from llama_cpp import Llama
6
+
7
+ from .utils import available_gguf_llms, num_vcpus, s2tw_converter
8
  # Detect logical cores (vCPUs available to the container)
9
  print(f"Detected vCPUs: {num_vcpus}")
10
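Note on the import change: switching to `from .utils import ...` makes summarization.py usable only as part of its package; Python resolves relative imports against a parent package, so the module now has to be imported rather than executed as a loose script.

# Works when the repo root is on sys.path:
from src import summarization

# Fails with "ImportError: attempted relative import with no known parent
# package" when run directly:  python src/summarization.py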