Luigi committed on
Commit ba4a241 · 1 Parent(s): 09695c9

re-implement in FastAPI

Dockerfile CHANGED
@@ -13,49 +13,29 @@ RUN apt-get update && apt-get install -y \
     libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 
-# === CRITICAL FIX + PERFORMANCE OPTIMIZATIONS ===
-# Set Streamlit to use temporary directories for ALL storage
 ENV HOME=/tmp
-ENV STREAMLIT_GLOBAL_DEVELOPMENT_MODE=false
-ENV STREAMLIT_GLOBAL_DATA_PATH=/tmp
-ENV STREAMLIT_CONFIG_DIR=/tmp/.streamlit
 ENV HF_HOME=/tmp/huggingface
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONPATH="/app/src"
+ENV PORT=7860
 
-# Create directories with open permissions including static audio directory
-RUN mkdir -p /tmp/.streamlit /tmp/huggingface /app/static && \
-    chmod -R 777 /tmp /app/static
-
-# Create config file with proper settings for large file handling
-RUN mkdir -p /tmp/.streamlit && \
-    cat <<EOF > /tmp/.streamlit/config.toml
-[browser]
-gatherUsageStats = false
-
-[server]
-enableCORS = false
-enableXsrfProtection = false
-maxUploadSize = 500
-maxMessageSize = 500
-
-[runner]
-maxCachedEntries = 1000
-fastReruns = true
-EOF
+# Create writable directories used at runtime
+RUN mkdir -p /tmp/huggingface /app/static /app/static/audio /app/tmp && \
+    chmod -R 777 /tmp /app/static /app/tmp
 
 # Copy files
 COPY requirements.txt ./
 COPY src/ ./src/
 COPY static/ ./static/
+COPY frontend/ ./frontend/
+COPY models/ ./models/
 
 # Install Python dependencies
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-EXPOSE 8501
+EXPOSE 7860
 
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+HEALTHCHECK CMD curl --fail http://localhost:7860/health || exit 1
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", \
-    "--server.port=8501", \
-    "--server.address=0.0.0.0", \
-    "--server.maxUploadSize=500", \
-    "--server.maxMessageSize=500"]
+ENTRYPOINT ["python", "-m", "uvicorn", "src.server.main:app"]
+CMD ["--host", "0.0.0.0", "--port", "7860"]
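The new ENTRYPOINT boots uvicorn against src.server.main:app, and the HEALTHCHECK now probes GET /health on port 7860, the port Hugging Face Spaces expects. A minimal sketch of what that entry module must expose for the container to pass its health check; the actual src/server/main.py is not part of this diff, so treat the body as an assumption:

# Hypothetical sketch of src/server/main.py, showing only the surface the Dockerfile relies on.
from fastapi import FastAPI

app = FastAPI(title="VoxSum Studio")  # the object referenced by "src.server.main:app"

@app.get("/health")
def health() -> dict:
    # Must answer 2xx quickly, or `curl --fail` in the HEALTHCHECK marks the container unhealthy.
    return {"status": "ok"}

Splitting the launch into ENTRYPOINT plus CMD keeps host and port overridable at `docker run` time without replacing the uvicorn invocation itself.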
frontend/app.js ADDED
@@ -0,0 +1,768 @@
+const state = {
+  config: { moonshine: {}, sensevoice: {}, llms: {} },
+  backend: 'sensevoice',
+  utterances: [],
+  diarizedUtterances: null,
+  diarizationStats: null,
+  summary: '',
+  audioUrl: null,
+  sourcePath: null,
+  uploadedFile: null,
+  transcribing: false,
+  summarizing: false,
+};
+
+const elements = {
+  backendSelect: document.getElementById('backend-select'),
+  modelSelect: document.getElementById('model-select'),
+  llmSelect: document.getElementById('llm-select'),
+  promptInput: document.getElementById('prompt-input'),
+  vadSlider: document.getElementById('vad-threshold'),
+  vadValue: document.getElementById('vad-value'),
+  diarizationToggle: document.getElementById('diarization-toggle'),
+  diarizationSettings: document.getElementById('diarization-settings'),
+  numSpeakers: document.getElementById('num-speakers'),
+  clusterSlider: document.getElementById('cluster-threshold'),
+  clusterValue: document.getElementById('cluster-value'),
+  sensevoiceOptions: document.getElementById('sensevoice-options'),
+  sensevoiceLanguage: document.getElementById('sensevoice-language'),
+  transcribeBtn: document.getElementById('transcribe-btn'),
+  summaryBtn: document.getElementById('summary-btn'),
+  statusText: document.getElementById('status-text'),
+  audioPlayer: document.getElementById('audio-player'),
+  transcriptList: document.getElementById('transcript-list'),
+  transcriptTemplate: document.getElementById('utterance-template'),
+  utteranceCount: document.getElementById('utterance-count'),
+  summaryOutput: document.getElementById('summary-output'),
+  diarizationPanel: document.getElementById('diarization-summary'),
+  diarizationMetrics: document.getElementById('diarization-metrics'),
+  speakerBreakdown: document.getElementById('speaker-breakdown'),
+  transcriptFormat: document.getElementById('transcript-format'),
+  summaryFormat: document.getElementById('summary-format'),
+  exportTranscriptBtn: document.getElementById('export-transcript'),
+  exportSummaryBtn: document.getElementById('export-summary'),
+  includeTimestamps: document.getElementById('include-timestamps'),
+  fileInput: document.getElementById('file-input'),
+  youtubeUrl: document.getElementById('youtube-url'),
+  youtubeFetch: document.getElementById('youtube-fetch'),
+  podcastQuery: document.getElementById('podcast-query'),
+  podcastSearch: document.getElementById('podcast-search'),
+  podcastResults: document.getElementById('podcast-results'),
+  episodeResults: document.getElementById('episode-results'),
+};
+
+const TRANSCRIPT_FORMATS = [
+  'SRT (SubRip)',
+  'VTT (WebVTT)',
+  'ASS (Advanced SubStation Alpha)',
+  'Plain Text',
+  'JSON',
+  'ELAN (EAF)',
+];
+
+const SUMMARY_FORMATS = ['Markdown', 'Plain Text'];
+
+let activeTab = 'podcast-tab';
+let activeUtteranceIndex = -1;
+
+function setStatus(message, tone = 'info') {
+  elements.statusText.textContent = message;
+  elements.statusText.dataset.tone = tone;
+}
+
+function formatTime(seconds) {
+  const mins = Math.floor(seconds / 60);
+  const secs = Math.floor(seconds % 60).toString().padStart(2, '0');
+  return `${mins}:${secs}`;
+}
+
+function setListEmpty(container, message) {
+  if (!container) return;
+  container.innerHTML = `<div class="empty-state">${message}</div>`;
+}
+
+async function fetchConfig() {
+  try {
+    const res = await fetch('/api/config/models');
+    if (!res.ok) throw new Error('Failed to fetch model catalog');
+    state.config = await res.json();
+    populateModelSelect();
+    populateLLMSelect();
+    populateExportSelects();
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+function populateModelSelect() {
+  const backend = state.backend;
+  elements.modelSelect.innerHTML = '';
+  const models = backend === 'moonshine' ? state.config.moonshine : state.config.sensevoice;
+  Object.entries(models).forEach(([label, value]) => {
+    const option = document.createElement('option');
+    option.value = value;
+    option.textContent = label;
+    elements.modelSelect.appendChild(option);
+  });
+  if (elements.modelSelect.options.length > 0) {
+    elements.modelSelect.selectedIndex = 0;
+  }
+  elements.sensevoiceOptions.classList.toggle('hidden', backend !== 'sensevoice');
+}
+
+function populateLLMSelect() {
+  elements.llmSelect.innerHTML = '';
+  Object.keys(state.config.llms).forEach((name) => {
+    const option = document.createElement('option');
+    option.value = name;
+    option.textContent = name;
+    elements.llmSelect.appendChild(option);
+  });
+}
+
+function populateExportSelects() {
+  elements.transcriptFormat.innerHTML = '';
+  TRANSCRIPT_FORMATS.forEach((fmt) => {
+    const option = document.createElement('option');
+    option.value = fmt;
+    option.textContent = fmt;
+    elements.transcriptFormat.appendChild(option);
+  });
+
+  elements.summaryFormat.innerHTML = '';
+  SUMMARY_FORMATS.forEach((fmt) => {
+    const option = document.createElement('option');
+    option.value = fmt;
+    option.textContent = fmt;
+    elements.summaryFormat.appendChild(option);
+  });
+}
+
+function initTabs() {
+  document.querySelectorAll('.tab').forEach((tab) => {
+    tab.addEventListener('click', () => {
+      if (tab.dataset.target === activeTab) return;
+      document.querySelectorAll('.tab').forEach((btn) => btn.classList.remove('active'));
+      document.querySelectorAll('.tab-panel').forEach((panel) => panel.classList.remove('active'));
+      tab.classList.add('active');
+      document.getElementById(tab.dataset.target).classList.add('active');
+      activeTab = tab.dataset.target;
+    });
+  });
+}
+
+function initSidebarInteractions() {
+  elements.backendSelect.addEventListener('change', () => {
+    state.backend = elements.backendSelect.value;
+    populateModelSelect();
+  });
+
+  elements.vadSlider.addEventListener('input', () => {
+    elements.vadValue.textContent = Number(elements.vadSlider.value).toFixed(2);
+  });
+
+  elements.diarizationToggle.addEventListener('change', () => {
+    elements.diarizationSettings.classList.toggle('hidden', !elements.diarizationToggle.checked);
+  });
+
+  elements.clusterSlider.addEventListener('input', () => {
+    elements.clusterValue.textContent = Number(elements.clusterSlider.value).toFixed(2);
+  });
+}
+
+function resetTranscriptionState() {
+  state.utterances = [];
+  state.diarizedUtterances = null;
+  state.diarizationStats = null;
+  activeUtteranceIndex = -1;
+  elements.transcriptList.innerHTML = '';
+  elements.utteranceCount.textContent = '';
+  elements.diarizationPanel.classList.add('hidden');
+}
+
+function prepareTranscriptionOptions() {
+  const textnormValue = document.querySelector('input[name="textnorm"]:checked')?.value || 'withitn';
+  return {
+    backend: state.backend,
+    model_name: elements.modelSelect.value,
+    vad_threshold: Number(elements.vadSlider.value),
+    language: state.backend === 'sensevoice' ? elements.sensevoiceLanguage.value : 'auto',
+    textnorm: textnormValue,
+    diarization: {
+      enable: elements.diarizationToggle.checked,
+      num_speakers: Number(elements.numSpeakers.value || -1),
+      cluster_threshold: Number(elements.clusterSlider.value),
+    },
+  };
+}
+
+async function handleTranscription() {
+  if (state.transcribing) return;
+  if (!state.uploadedFile && !state.audioUrl) {
+    setStatus('Upload or select an audio source first', 'warning');
+    return;
+  }
+
+  resetTranscriptionState();
+  state.transcribing = true;
+  setStatus('Starting transcription...', 'info');
+
+  const formData = new FormData();
+  if (state.uploadedFile) {
+    formData.append('audio', state.uploadedFile, state.uploadedFile.name);
+  } else if (state.audioUrl) {
+    formData.append('source', state.audioUrl);
+  }
+  formData.append('options', JSON.stringify(prepareTranscriptionOptions()));
+
+  try {
+    const response = await fetch('/api/transcribe', {
+      method: 'POST',
+      body: formData,
+    });
+    if (!response.ok || !response.body) {
+      throw new Error('Transcription request failed');
+    }
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+    setStatus('Processing audio...', 'info');
+
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let lines = buffer.split('\n');
+      buffer = lines.pop();
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        const event = JSON.parse(line);
+        handleTranscriptionEvent(event);
+      }
+    }
+
+    if (buffer.trim()) {
+      handleTranscriptionEvent(JSON.parse(buffer));
+    }
+
+    setStatus('Transcription complete', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  } finally {
+    state.transcribing = false;
+  }
+}
+
+function handleTranscriptionEvent(event) {
+  switch (event.type) {
+    case 'ready':
+      if (event.audioUrl) {
+        state.audioUrl = event.audioUrl;
+        elements.audioPlayer.src = event.audioUrl;
+        elements.audioPlayer.currentTime = 0;
+      }
+      break;
+    case 'utterance':
+      if (event.utterance) {
+        state.utterances.push(event.utterance);
+        renderTranscript();
+      }
+      break;
+    case 'complete':
+      if (event.diarization) {
+        state.diarizedUtterances = event.diarization.utterances || [];
+        state.diarizationStats = event.diarization.stats || null;
+      }
+      if (event.utterances) {
+        const diarized = state.diarizedUtterances?.length ? state.diarizedUtterances : null;
+        state.utterances = diarized
+          ? diarized.map((utt, index) => ({
+              ...(event.utterances[index] || {}),
+              ...utt,
+            }))
+          : event.utterances;
+      } else if (state.diarizedUtterances?.length) {
+        state.utterances = state.diarizedUtterances;
+      }
+      renderTranscript();
+      renderDiarizationStats();
+      break;
+    case 'error':
+      setStatus(event.message || 'Transcription error', 'error');
+      break;
+  }
+}
+
+function renderTranscript() {
+  elements.transcriptList.innerHTML = '';
+  const fragment = document.createDocumentFragment();
+  state.utterances.forEach((utt, index) => {
+    const node = elements.transcriptTemplate.content.cloneNode(true);
+    const item = node.querySelector('.utterance-item');
+    item.dataset.index = index.toString();
+    item.dataset.start = utt.start;
+    item.dataset.end = utt.end;
+
+    node.querySelector('.timestamp').textContent = `[${formatTime(utt.start)}]`;
+    node.querySelector('.utterance-text').textContent = utt.text;
+
+    const speakerTag = node.querySelector('.speaker-tag');
+    if (typeof utt.speaker === 'number') {
+      speakerTag.textContent = `Speaker ${utt.speaker + 1}`;
+      speakerTag.classList.remove('hidden');
+    }
+
+    fragment.appendChild(node);
+  });
+  elements.transcriptList.appendChild(fragment);
+  elements.utteranceCount.textContent = `${state.utterances.length} segments`;
+}
+
+function renderDiarizationStats() {
+  if (!state.diarizationStats) {
+    elements.diarizationPanel.classList.add('hidden');
+    return;
+  }
+  elements.diarizationPanel.classList.remove('hidden');
+  const stats = state.diarizationStats;
+
+  elements.diarizationMetrics.innerHTML = '';
+  const metricsFragment = document.createDocumentFragment();
+
+  const totalCard = document.createElement('div');
+  totalCard.className = 'metric-card';
+  totalCard.innerHTML = `<strong>Total speakers:</strong> ${stats.total_speakers || 0}<br/><strong>Duration:</strong> ${stats.total_duration?.toFixed(1) || 0}s`;
+  metricsFragment.appendChild(totalCard);
+  elements.diarizationMetrics.appendChild(metricsFragment);
+
+  elements.speakerBreakdown.innerHTML = '';
+  const speakersFragment = document.createDocumentFragment();
+  Object.entries(stats.speakers || {}).forEach(([speakerId, info]) => {
+    const card = document.createElement('div');
+    card.className = 'metric-card';
+    card.innerHTML = `
+      <strong>Speaker ${Number(speakerId) + 1}</strong><br/>
+      Speaking time: ${info.speaking_time.toFixed(1)}s<br/>
+      Percentage: ${info.percentage.toFixed(1)}%<br/>
+      Utterances: ${info.utterances}<br/>
+      Avg length: ${info.avg_utterance_length.toFixed(1)}s
+    `;
+    speakersFragment.appendChild(card);
+  });
+  elements.speakerBreakdown.appendChild(speakersFragment);
+}
+
+function findActiveUtterance(currentTime) {
+  let left = 0;
+  let right = state.utterances.length - 1;
+  let match = -1;
+  while (left <= right) {
+    const mid = Math.floor((left + right) / 2);
+    const utt = state.utterances[mid];
+    if (currentTime >= utt.start && currentTime < utt.end) {
+      return mid;
+    }
+    if (currentTime < utt.start) {
+      right = mid - 1;
+    } else {
+      match = mid;
+      left = mid + 1;
+    }
+  }
+  return match;
+}
+
+function updateActiveUtterance(index) {
+  if (index === activeUtteranceIndex) return;
+  const previous = elements.transcriptList.querySelector('.utterance-item.active');
+  if (previous) previous.classList.remove('active');
+  const current = elements.transcriptList.querySelector(`.utterance-item[data-index="${index}"]`);
+  if (current) {
+    current.classList.add('active');
+    current.scrollIntoView({ behavior: 'smooth', block: 'center' });
+  }
+  activeUtteranceIndex = index;
+}
+
+function initAudioInteractions() {
+  elements.audioPlayer.addEventListener('timeupdate', () => {
+    if (!state.utterances.length) return;
+    const idx = findActiveUtterance(elements.audioPlayer.currentTime);
+    if (idx >= 0) updateActiveUtterance(idx);
+  });
+
+  elements.transcriptList.addEventListener('click', (event) => {
+    const item = event.target.closest('.utterance-item');
+    if (!item) return;
+    const editButton = event.target.closest('.edit-btn');
+    const saveButton = event.target.closest('.save-edit');
+    const cancelButton = event.target.closest('.cancel-edit');
+
+    const index = Number(item.dataset.index);
+
+    if (editButton) {
+      toggleEdit(item, true);
+      return;
+    }
+
+    if (saveButton) {
+      const textarea = item.querySelector('textarea');
+      const newText = textarea.value.trim();
+      if (newText.length === 0) return;
+      state.utterances[index].text = newText;
+      item.querySelector('.utterance-text').textContent = newText;
+      toggleEdit(item, false);
+      return;
+    }
+
+    if (cancelButton) {
+      toggleEdit(item, false);
+      return;
+    }
+
+    const start = Number(item.dataset.start);
+    seekToTime(start);
+  });
+}
+
+function toggleEdit(item, editing) {
+  const textBlock = item.querySelector('.utterance-text');
+  const editArea = item.querySelector('.edit-area');
+  if (!textBlock || !editArea) return;
+
+  if (editing) {
+    const textarea = editArea.querySelector('textarea');
+    textarea.value = textBlock.textContent;
+    textBlock.classList.add('hidden');
+    editArea.classList.remove('hidden');
+  } else {
+    textBlock.classList.remove('hidden');
+    editArea.classList.add('hidden');
+  }
+}
+
+function seekToTime(timeInSeconds) {
+  if (!Number.isFinite(timeInSeconds)) return;
+  const audio = elements.audioPlayer;
+
+  const executeSeek = () => {
+    audio.currentTime = Math.max(0, timeInSeconds);
+    updateActiveUtterance(findActiveUtterance(audio.currentTime));
+    audio.play().catch(() => {});
+  };
+
+  if (audio.readyState >= 1) {
+    executeSeek();
+  } else {
+    const onLoaded = () => {
+      executeSeek();
+      audio.removeEventListener('loadedmetadata', onLoaded);
+    };
+    audio.addEventListener('loadedmetadata', onLoaded);
+    audio.load();
+  }
+}
+
+async function handleSummaryGeneration() {
+  if (state.summarizing || !state.utterances.length) return;
+  state.summarizing = true;
+  setStatus('Generating summary...', 'info');
+  elements.summaryOutput.textContent = '';
+
+  const payload = {
+    transcript: state.utterances.map((u) => u.text).join('\n'),
+    llm_model: elements.llmSelect.value,
+    prompt: elements.promptInput.value || 'Summarize the transcript below.',
+  };
+
+  try {
+    const response = await fetch('/api/summarize', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+
+    if (!response.ok || !response.body) throw new Error('Failed to generate summary');
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let lines = buffer.split('\n');
+      buffer = lines.pop();
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        const event = JSON.parse(line);
+        if (event.type === 'partial' && event.content) {
+          elements.summaryOutput.textContent = event.content;
+        }
+      }
+    }
+
+    setStatus('Summary ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  } finally {
+    state.summarizing = false;
+  }
+}
+
+async function handleExportTranscript() {
+  if (!state.utterances.length) return;
+  const payload = {
+    format: elements.transcriptFormat.value,
+    include_timestamps: elements.includeTimestamps.checked,
+    utterances: state.utterances,
+  };
+  await downloadFile('/api/export/transcript', payload, 'transcript');
+}
+
+async function handleExportSummary() {
+  if (!elements.summaryOutput.textContent.trim()) return;
+  const payload = {
+    format: elements.summaryFormat.value,
+    summary: elements.summaryOutput.textContent,
+    metadata: {},
+  };
+  await downloadFile('/api/export/summary', payload, 'summary');
+}
+
+async function downloadFile(url, payload, prefix) {
+  try {
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+    if (!response.ok) throw new Error('Export failed');
+    const blob = await response.blob();
+    const filename = getFilenameFromDisposition(response.headers.get('Content-Disposition')) || `${prefix}.txt`;
+    const link = document.createElement('a');
+    link.href = URL.createObjectURL(blob);
+    link.download = filename;
+    link.click();
+    URL.revokeObjectURL(link.href);
+    setStatus('Export complete', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+function getFilenameFromDisposition(disposition) {
+  if (!disposition) return null;
+  const match = disposition.match(/filename="?([^"]+)"?/i);
+  return match ? match[1] : null;
+}
+
+function handleFileUpload(event) {
+  const file = event.target.files?.[0];
+  if (!file) return;
+  state.uploadedFile = file;
+  state.audioUrl = null;
+  const objectUrl = URL.createObjectURL(file);
+  elements.audioPlayer.src = objectUrl;
+  setStatus(`Loaded ${file.name}`, 'info');
+}
+
+async function handleYoutubeFetch() {
+  if (!elements.youtubeUrl.value.trim()) return;
+  setStatus('Downloading audio from YouTube...', 'info');
+  try {
+    const res = await fetch('/api/youtube/fetch', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ url: elements.youtubeUrl.value.trim() }),
+    });
+    if (!res.ok) throw new Error('YouTube download failed');
+    const data = await res.json();
+    state.audioUrl = data.audioUrl;
+    state.uploadedFile = null;
+    elements.audioPlayer.src = data.audioUrl;
+    setStatus('YouTube audio ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  }
+}
+
+async function handlePodcastSearch() {
+  const query = elements.podcastQuery.value.trim();
+  if (!query) return;
+  setStatus('Searching podcasts...', 'info');
+  setListEmpty(elements.podcastResults, 'Searching podcasts...');
+  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+  try {
+    const res = await fetch(`/api/podcast/search?query=${encodeURIComponent(query)}`);
+    if (!res.ok) throw new Error('Podcast search failed');
+    const series = await res.json();
+    if (!series.length) {
+      setListEmpty(elements.podcastResults, 'No podcasts match your search yet.');
+      return;
+    }
+    elements.podcastResults.innerHTML = '';
+    const fragment = document.createDocumentFragment();
+    series.forEach((item) => {
+      const div = document.createElement('div');
+      div.className = 'list-item';
+      div.innerHTML = `
+        <div>
+          <strong>${item.title}</strong><br/>
+          <span>${item.artist || 'Unknown artist'}</span>
+        </div>
+        <button data-feed="${item.feed_url}">Episodes</button>
+      `;
+      fragment.appendChild(div);
+    });
+    elements.podcastResults.appendChild(fragment);
+    setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    setListEmpty(elements.podcastResults, 'Unable to load podcasts right now.');
+  }
+}
+
+async function loadEpisodes(feedUrl, sourceItem = null) {
+  setStatus('Loading episodes...', 'info');
+  if (sourceItem) {
+    elements.podcastResults.querySelectorAll('.list-item').forEach((item) => item.classList.remove('selected'));
+    sourceItem.classList.add('selected');
+  }
+  setListEmpty(elements.episodeResults, 'Loading episodes...');
+  try {
+    const res = await fetch(`/api/podcast/episodes?feed_url=${encodeURIComponent(feedUrl)}`);
+    if (!res.ok) throw new Error('Failed to load episodes');
+    const episodes = await res.json();
+    if (!episodes.length) {
+      setListEmpty(elements.episodeResults, 'No episodes available for this podcast.');
+      return;
+    }
+    elements.episodeResults.innerHTML = '';
+    const fragment = document.createDocumentFragment();
+    episodes.slice(0, 15).forEach((ep) => {
+      const div = document.createElement('div');
+      div.className = 'list-item';
+      div.innerHTML = `
+        <div>
+          <strong>${ep.title}</strong><br/>
+          <span>${ep.published || ''}</span>
+        </div>
+        <button data-url="${ep.audio_url}" data-title="${ep.title}">Download</button>
+      `;
+      fragment.appendChild(div);
+    });
+    elements.episodeResults.appendChild(fragment);
+    setStatus('Episodes ready', 'success');
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    setListEmpty(elements.episodeResults, 'Unable to load episodes right now.');
+  }
+}
+
+async function downloadEpisode(audioUrl, title, triggerButton = null) {
+  setStatus('Downloading episode...', 'info');
+  let originalLabel = null;
+  if (triggerButton) {
+    originalLabel = triggerButton.innerHTML;
+    triggerButton.disabled = true;
+    triggerButton.classList.add('loading');
+    triggerButton.textContent = 'Downloading…';
+  }
+  try {
+    const res = await fetch('/api/podcast/download', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ audioUrl, title }),
+    });
+    if (!res.ok) throw new Error('Episode download failed');
+    const data = await res.json();
+    state.audioUrl = data.audioUrl;
+    state.uploadedFile = null;
+    elements.audioPlayer.src = data.audioUrl;
+    setStatus('Episode ready', 'success');
+    if (triggerButton) {
+      triggerButton.textContent = 'Ready ✓';
+      triggerButton.classList.add('success');
+    }
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+    if (triggerButton) {
+      triggerButton.textContent = 'Retry';
+      triggerButton.classList.add('error');
+    }
+  } finally {
+    if (triggerButton) {
+      triggerButton.disabled = false;
+      triggerButton.classList.remove('loading');
+      setTimeout(() => {
+        triggerButton.classList.remove('success', 'error');
+        triggerButton.textContent = originalLabel || 'Download';
+      }, 2000);
+    }
+  }
+}
+
+function initPodcastInteractions() {
+  elements.podcastResults.addEventListener('click', (event) => {
+    const btn = event.target.closest('button[data-feed]');
+    if (!btn) return;
+    const listItem = btn.closest('.list-item');
+    loadEpisodes(btn.dataset.feed, listItem);
+  });
+
+  elements.episodeResults.addEventListener('click', (event) => {
+    const btn = event.target.closest('button[data-url]');
+    if (!btn) return;
+    downloadEpisode(btn.dataset.url, btn.dataset.title, btn);
+  });
+}
+
+function initEventBindings() {
+  elements.transcribeBtn.addEventListener('click', handleTranscription);
+  elements.summaryBtn.addEventListener('click', handleSummaryGeneration);
+  elements.exportTranscriptBtn.addEventListener('click', handleExportTranscript);
+  elements.exportSummaryBtn.addEventListener('click', handleExportSummary);
+  elements.fileInput.addEventListener('change', handleFileUpload);
+  elements.youtubeFetch.addEventListener('click', handleYoutubeFetch);
+  elements.podcastSearch.addEventListener('click', handlePodcastSearch);
+  elements.podcastQuery.addEventListener('keydown', (event) => {
+    if (event.key === 'Enter') {
+      event.preventDefault();
+      handlePodcastSearch();
+    }
+  });
+}
+
+async function init() {
+  initTabs();
+  initSidebarInteractions();
+  initAudioInteractions();
+  initEventBindings();
+  initPodcastInteractions();
+
+  elements.backendSelect.innerHTML = `
+    <option value="moonshine">Moonshine</option>
+    <option value="sensevoice" selected>SenseVoice</option>
+  `;
+  state.backend = elements.backendSelect.value;
+
+  setListEmpty(elements.podcastResults, 'Search to discover podcasts.');
+  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
+
+  await fetchConfig();
+  setStatus('Ready');
+}
+
+init();
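handleTranscription and handleSummaryGeneration both consume the response body as newline-delimited JSON: chunks are buffered, split on '\n', and each complete line is parsed and dispatched on its type field ('ready', 'utterance', 'complete', 'error' for transcription; 'partial' for summaries). For reference, a server-side sketch of a compatible producer built on FastAPI's StreamingResponse; the real /api/transcribe handler is not shown in this commit, so the payload values are placeholders inferred from the client code:

# Hypothetical NDJSON producer matching the events app.js dispatches on.
import json

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/api/transcribe")
async def transcribe():
    def events():
        # Exactly one JSON document per line; the client splits on newlines.
        yield json.dumps({"type": "ready", "audioUrl": "/static/audio/example.wav"}) + "\n"
        yield json.dumps({
            "type": "utterance",
            "utterance": {"start": 0.0, "end": 2.4, "text": "placeholder"},
        }) + "\n"
        yield json.dumps({"type": "complete", "utterances": []}) + "\n"
    return StreamingResponse(events(), media_type="application/x-ndjson")

Emitting one JSON object per line is what lets the client render utterances incrementally while the model is still running, instead of waiting for the whole transcript.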
frontend/index.html ADDED
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>VoxSum Studio</title>
+  <link rel="stylesheet" href="/styles.css" />
+</head>
+<body>
+  <header class="app-header">
+    <h1>VoxSum Studio</h1>
+    <p class="tagline">Transform Audio into Insightful Summaries</p>
+  </header>
+  <div class="app-shell">
+    <aside class="sidebar">
+      <section class="panel">
+        <h2>ASR Settings</h2>
+        <label for="backend-select">Backend</label>
+        <select id="backend-select"></select>
+
+        <label for="model-select">Model</label>
+        <select id="model-select"></select>
+
+        <div id="sensevoice-options" class="conditional hidden">
+          <label for="sensevoice-language">Language</label>
+          <select id="sensevoice-language">
+            <option value="auto">Auto</option>
+            <option value="zh">Chinese</option>
+            <option value="en">English</option>
+            <option value="ja">Japanese</option>
+            <option value="ko">Korean</option>
+            <option value="yue">Cantonese</option>
+          </select>
+
+          <label>Text Normalization</label>
+          <div class="radio-group">
+            <label><input type="radio" name="textnorm" value="withitn" checked /> With ITN</label>
+            <label><input type="radio" name="textnorm" value="noitn" /> Raw</label>
+          </div>
+        </div>
+
+        <label for="vad-threshold">VAD Threshold</label>
+        <input id="vad-threshold" type="range" min="0.1" max="0.9" step="0.05" value="0.5" />
+        <span id="vad-value" class="hint">0.50</span>
+      </section>
+
+      <section class="panel">
+        <h2>Diarization</h2>
+        <label class="toggle">
+          <input id="diarization-toggle" type="checkbox" /> Enable speaker diarization
+        </label>
+        <div id="diarization-settings" class="conditional hidden">
+          <label for="num-speakers">Number of speakers (-1 = auto)</label>
+          <input id="num-speakers" type="number" min="-1" max="10" value="-1" />
+
+          <label for="cluster-threshold">Cluster threshold</label>
+          <input id="cluster-threshold" type="range" min="0.1" max="1" step="0.05" value="0.5" />
+          <span id="cluster-value" class="hint">0.50</span>
+        </div>
+      </section>
+
+      <section class="panel">
+        <h2>Summarization</h2>
+        <label for="llm-select">LLM Model</label>
+        <select id="llm-select"></select>
+
+        <label for="prompt-input">Custom Prompt</label>
+        <textarea id="prompt-input" rows="4">Summarize the transcript below.</textarea>
+      </section>
+    </aside>
+
+    <main class="content">
+      <nav class="tabs">
+        <button class="tab active" data-target="podcast-tab">🎙️ Podcast</button>
+        <button class="tab" data-target="audio-tab">🎵 Audio Input</button>
+        <button class="tab" data-target="results-tab">📄 Results</button>
+      </nav>
+
+      <section id="podcast-tab" class="tab-panel active">
+        <div class="panel">
+          <h2>Search Podcasts</h2>
+          <div class="form-row">
+            <input id="podcast-query" type="text" placeholder="Podcast title" />
+            <button id="podcast-search">Search</button>
+          </div>
+          <div class="list-grid">
+            <section class="list-section">
+              <header class="list-section-header">
+                <h3>Podcast Channels</h3>
+                <p class="list-hint">Pick a show to reveal recent episodes.</p>
+              </header>
+              <div id="podcast-results" class="list"></div>
+            </section>
+            <section class="list-section">
+              <header class="list-section-header">
+                <h3>Episodes</h3>
+                <p class="list-hint">Episodes for the selected podcast appear here.</p>
+              </header>
+              <div id="episode-results" class="list"></div>
+            </section>
+          </div>
+        </div>
+      </section>
+
+      <section id="audio-tab" class="tab-panel">
+        <div class="panel">
+          <h2>YouTube</h2>
+          <div class="form-row">
+            <input id="youtube-url" type="url" placeholder="https://youtube.com/..." />
+            <button id="youtube-fetch">Fetch Audio</button>
+          </div>
+        </div>
+        <div class="panel">
+          <h2>Upload Audio</h2>
+          <input id="file-input" type="file" accept="audio/*" />
+        </div>
+      </section>
+
+      <section id="results-tab" class="tab-panel">
+        <div class="actions">
+          <button id="transcribe-btn" class="primary">Transcribe Audio</button>
+          <button id="summary-btn" class="secondary">Generate Summary</button>
+          <span id="status-text" class="status-text">Ready</span>
+        </div>
+
+        <section class="panel">
+          <h2>Audio Player</h2>
+          <audio id="audio-player" controls preload="auto"></audio>
+        </section>
+
+        <section class="panel">
+          <div class="panel-header">
+            <h2>Transcript</h2>
+            <span id="utterance-count" class="hint"></span>
+          </div>
+          <div id="transcript-container">
+            <ul id="transcript-list"></ul>
+          </div>
+        </section>
+
+        <section id="diarization-summary" class="panel hidden">
+          <h2>Speaker Analysis</h2>
+          <div id="diarization-metrics"></div>
+          <div id="speaker-breakdown"></div>
+        </section>
+
+        <section class="panel">
+          <h2>Summary</h2>
+          <div id="summary-output" class="summary"></div>
+        </section>
+
+        <section class="panel">
+          <h2>Export</h2>
+          <div class="export-grid">
+            <div>
+              <label for="transcript-format">Transcript format</label>
+              <select id="transcript-format"></select>
+            </div>
+            <div>
+              <label class="toggle">
+                <input id="include-timestamps" type="checkbox" checked /> Include timestamps
+              </label>
+            </div>
+            <button id="export-transcript">Export Transcript</button>
+            <div>
+              <label for="summary-format">Summary format</label>
+              <select id="summary-format"></select>
+            </div>
+            <button id="export-summary">Export Summary</button>
+          </div>
+        </section>
+      </section>
+    </main>
+  </div>
+
+  <template id="utterance-template">
+    <li class="utterance-item">
+      <div class="utterance-header">
+        <span class="timestamp"></span>
+        <span class="speaker-tag hidden"></span>
+        <div class="utterance-actions">
+          <button class="edit-btn" title="Edit">✏️</button>
+        </div>
+      </div>
+      <div class="utterance-text"></div>
+      <div class="edit-area hidden">
+        <textarea rows="3"></textarea>
+        <div class="edit-controls">
+          <button class="save-edit">Save</button>
+          <button class="cancel-edit">Cancel</button>
+        </div>
+      </div>
+    </li>
+  </template>
+
+  <script src="/app.js" type="module"></script>
+</body>
+</html>
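The page requests /styles.css and /app.js from the site root, so the backend has to serve the frontend/ directory (which the Dockerfile now copies into the image) at /. One plausible wiring uses Starlette's StaticFiles with html=True so that GET / falls back to index.html; how the actual server mounts it is not visible in this diff:

# Hypothetical static mount; API routes must be registered before this catch-all.
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

app = FastAPI()
# ... /api/* routes registered here ...
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")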
frontend/styles.css ADDED
@@ -0,0 +1,448 @@
+* {
+  box-sizing: border-box;
+}
+
+body {
+  margin: 0;
+  font-family: 'Inter', 'Segoe UI', sans-serif;
+  background: linear-gradient(180deg, #0f172a 0%, #111827 100%);
+  color: #e5e7eb;
+  min-height: 100vh;
+}
+
+.app-header {
+  padding: 2rem 3rem 1.5rem;
+  background: rgba(15, 23, 42, 0.8);
+  backdrop-filter: blur(10px);
+  border-bottom: 1px solid rgba(148, 163, 184, 0.2);
+}
+
+.app-header h1 {
+  margin: 0;
+  font-size: 2.5rem;
+  letter-spacing: 0.05em;
+}
+
+.app-header .tagline {
+  margin: 0.5rem 0 0;
+  color: #94a3b8;
+}
+
+.app-shell {
+  display: grid;
+  grid-template-columns: 320px 1fr;
+  gap: 1.5rem;
+  padding: 1.5rem 2rem 3rem;
+}
+
+.sidebar {
+  display: flex;
+  flex-direction: column;
+  gap: 1.5rem;
+}
+
+.panel {
+  background: rgba(30, 41, 59, 0.7);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  border-radius: 16px;
+  padding: 1.25rem;
+  box-shadow: 0 20px 45px rgba(15, 23, 42, 0.35);
+}
+
+.panel h2 {
+  margin: 0 0 1rem;
+  font-size: 1.1rem;
+  letter-spacing: 0.02em;
+}
+
+.panel-header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  margin-bottom: 0.5rem;
+}
+
+label {
+  display: block;
+  font-size: 0.9rem;
+  margin-bottom: 0.35rem;
+  color: #cbd5f5;
+}
+
+input[type="text"],
+input[type="url"],
+input[type="number"],
+select,
+textarea {
+  width: 100%;
+  padding: 0.6rem 0.75rem;
+  border-radius: 10px;
+  border: 1px solid rgba(148, 163, 184, 0.2);
+  background: rgba(15, 23, 42, 0.6);
+  color: #e5e7eb;
+  font: inherit;
+  transition: border-color 0.2s ease, box-shadow 0.2s ease;
+}
+
+input:focus,
+select:focus,
+textarea:focus {
+  outline: none;
+  border-color: #38bdf8;
+  box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
+}
+
+textarea {
+  resize: vertical;
+}
+
+input[type="range"] {
+  width: 100%;
+  margin: 0.5rem 0;
+}
+
+.hint {
+  font-size: 0.8rem;
+  color: #94a3b8;
+}
+
+.toggle {
+  display: flex;
+  align-items: center;
+  gap: 0.6rem;
+  font-size: 0.9rem;
+}
+
+.radio-group {
+  display: flex;
+  gap: 0.75rem;
+  margin-bottom: 0.5rem;
+}
+
+.radio-group input {
+  margin-right: 0.35rem;
+}
+
+.content {
+  display: flex;
+  flex-direction: column;
+  gap: 1.5rem;
+}
+
+.tabs {
+  display: inline-flex;
+  background: rgba(30, 41, 59, 0.6);
+  border-radius: 999px;
+  padding: 0.4rem;
+  width: fit-content;
+  border: 1px solid rgba(148, 163, 184, 0.2);
+}
+
+.tab {
+  border: none;
+  background: transparent;
+  color: #94a3b8;
+  padding: 0.6rem 1.2rem;
+  border-radius: 999px;
+  font: inherit;
+  cursor: pointer;
+  transition: all 0.2s ease;
+}
+
+.tab.active {
+  background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
+  color: #0f172a;
+  font-weight: 600;
+}
+
+.tab-panel {
+  display: none;
+}
+
+.tab-panel.active {
+  display: block;
+}
+
+.form-row {
+  display: flex;
+  gap: 0.75rem;
+}
+
+.form-row input {
+  flex: 1;
+}
+
+.list {
+  margin-top: 1rem;
+  display: grid;
+  gap: 0.75rem;
+}
+
+.list-grid {
+  margin-top: 1.5rem;
+  display: grid;
+  gap: 1.25rem;
+  grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
+  align-items: start;
+}
+
+.list-section {
+  background: rgba(15, 23, 42, 0.4);
+  border: 1px solid rgba(148, 163, 184, 0.18);
+  border-radius: 16px;
+  padding: 1rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.75rem;
+}
+
+.list-section-header {
+  display: flex;
+  flex-direction: column;
+  gap: 0.35rem;
+}
+
+.list-section-header h3 {
+  margin: 0;
+  font-size: 1rem;
+  letter-spacing: 0.02em;
+}
+
+.list-hint {
+  margin: 0;
+  font-size: 0.85rem;
+  color: #9ca3af;
+}
+
+.list-item {
+  padding: 0.75rem;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.55);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  gap: 1rem;
+}
+
+.list-item button {
+  flex-shrink: 0;
+  display: inline-flex;
+  align-items: center;
+  gap: 0.45rem;
+  transition: all 0.2s ease;
+}
+
+.list-item button.loading {
+  pointer-events: none;
+  opacity: 0.75;
+}
+
+.list-item button.loading::before {
+  content: '';
+  width: 0.9rem;
+  height: 0.9rem;
+  border-radius: 50%;
+  border: 2px solid rgba(148, 163, 184, 0.4);
+  border-top-color: #38bdf8;
+  animation: spin 0.8s linear infinite;
+}
+
+.list-item button.success {
+  background: rgba(34, 197, 94, 0.18);
+  border-color: rgba(34, 197, 94, 0.35);
+  color: #86efac;
+}
+
+.list-item button.error {
+  background: rgba(248, 113, 113, 0.18);
+  border-color: rgba(248, 113, 113, 0.35);
+  color: #fca5a5;
+}
+
+@keyframes spin {
+  to {
+    transform: rotate(360deg);
+  }
+}
+
+.list-item.selected {
+  border-color: rgba(56, 189, 248, 0.6);
+  box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
+}
+
+.empty-state {
+  padding: 1rem;
+  text-align: center;
+  border: 1px dashed rgba(148, 163, 184, 0.25);
+  border-radius: 12px;
+  color: #94a3b8;
+  font-size: 0.9rem;
+  background: rgba(15, 23, 42, 0.35);
+}
+
+.actions {
+  display: flex;
+  gap: 1rem;
+  align-items: center;
+}
+
+button {
+  border: none;
+  border-radius: 10px;
+  padding: 0.65rem 1.1rem;
+  font: inherit;
+  cursor: pointer;
+  color: #0f172a;
+  background: #e2e8f0;
+  transition: transform 0.2s ease, box-shadow 0.2s ease;
+}
+
+button.primary {
+  background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
+  color: #0f172a;
+  font-weight: 600;
+}
+
+button.secondary {
+  background: rgba(148, 163, 184, 0.2);
+  color: #e5e7eb;
+}
+
+button:hover {
+  transform: translateY(-1px);
+  box-shadow: 0 10px 25px rgba(15, 23, 42, 0.25);
+}
+
+.status-text {
+  color: #eab308;
+  font-size: 0.9rem;
+}
+
+#transcript-container {
+  max-height: 420px;
+  overflow: auto;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.4);
+  border: 1px solid rgba(148, 163, 184, 0.15);
+}
+
+#transcript-list {
+  list-style: none;
+  padding: 0;
+  margin: 0;
+}
+
+.utterance-item {
+  padding: 0.85rem 1rem;
+  border-bottom: 1px solid rgba(148, 163, 184, 0.1);
+  transition: background 0.2s ease;
+}
+
+.utterance-item:last-child {
+  border-bottom: none;
+}
+
+.utterance-item.active {
+  background: rgba(56, 189, 248, 0.15);
+  border-left: 3px solid #38bdf8;
+}
+
+.utterance-header {
+  display: flex;
+  align-items: center;
+  gap: 0.75rem;
+}
+
+.timestamp {
+  font-size: 0.8rem;
+  color: #94a3b8;
+  min-width: 70px;
+}
+
+.speaker-tag {
+  font-size: 0.75rem;
+  padding: 0.1rem 0.5rem;
+  border-radius: 999px;
+  background: rgba(129, 140, 248, 0.2);
+}
+
+.utterance-actions {
+  margin-left: auto;
+  display: flex;
+  gap: 0.5rem;
+}
+
+.edit-btn {
+  background: rgba(148, 163, 184, 0.2);
+  color: #e5e7eb;
+  padding: 0.3rem 0.6rem;
+  font-size: 0.85rem;
+}
+
+.utterance-text {
+  margin-top: 0.4rem;
+  line-height: 1.5;
+}
+
+.edit-area {
+  margin-top: 0.6rem;
+  display: grid;
+  gap: 0.5rem;
+}
+
+.edit-area textarea {
+  width: 100%;
+}
+
+.edit-controls {
+  display: flex;
+  gap: 0.5rem;
+}
+
+.summary {
+  min-height: 120px;
+  background: rgba(15, 23, 42, 0.5);
+  border-radius: 12px;
+  padding: 1rem;
+  border: 1px solid rgba(148, 163, 184, 0.15);
+  white-space: pre-wrap;
+}
+
+.export-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+  gap: 0.75rem;
+  align-items: end;
+}
+
+#diarization-metrics,
+#speaker-breakdown {
+  display: grid;
+  gap: 0.75rem;
+}
+
+.metric-card {
+  padding: 0.75rem;
+  border-radius: 12px;
+  background: rgba(15, 23, 42, 0.5);
+  border: 1px solid rgba(148, 163, 184, 0.1);
+}
+
+.hidden {
+  display: none !important;
+}
+
+@media (max-width: 1100px) {
+  .app-shell {
+    grid-template-columns: 1fr;
+  }
+
+  .sidebar {
+    order: 2;
+  }
+
+  .content {
+    order: 1;
+  }
+}
requirements.txt CHANGED
@@ -1,7 +1,4 @@
1
  --extra-index-url https://download.pytorch.org/whl/cpu
2
- altair
3
- pandas
4
- streamlit
5
  numpy<2.0
6
  soundfile
7
  onnxruntime
@@ -16,4 +13,9 @@ ffmpeg-python
16
  feedparser
17
  sherpa_onnx
18
  huggingface_hub
19
- faiss-cpu
 
 
 
 
 
 
1
  --extra-index-url https://download.pytorch.org/whl/cpu
 
 
 
2
  numpy<2.0
3
  soundfile
4
  onnxruntime
 
13
  feedparser
14
  sherpa_onnx
15
  huggingface_hub
16
+ faiss-cpu
17
+ fastapi
18
+ uvicorn[standard]
19
+ python-multipart
20
+ jinja2
21
+ aiofiles
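The dependency swap mirrors the rewrite: the Streamlit stack (altair, pandas, streamlit) goes away, and fastapi, uvicorn[standard], python-multipart, jinja2, and aiofiles come in. python-multipart is the easy one to forget: FastAPI refuses to register routes with File or Form parameters unless it is installed, and the new frontend posts its audio as multipart FormData. A sketch of the kind of endpoint that needs it (the real handler is not shown in this commit):

# Hypothetical multipart endpoint; File/Form parameters require python-multipart.
from fastapi import FastAPI, File, Form, UploadFile

app = FastAPI()

@app.post("/api/transcribe")
async def transcribe(audio: UploadFile | None = File(None), options: str = Form("{}")):
    payload = await audio.read() if audio else b""
    return {"received_bytes": len(payload)}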
src/asr.py CHANGED
@@ -1,11 +1,12 @@
 # asr.py
+import os
+import re
+import tempfile
+from typing import Iterable, List, Optional, Tuple
+
 import numpy as np
 import soundfile as sf
 from scipy.signal import resample_poly
-import re
-from typing import Optional, Tuple, List
-import tempfile
-import os
 
 # Lazy / optional imports: guard heavy or optional ASR backends
 try:
@@ -20,11 +21,7 @@ except Exception:
     MoonshineOnnxModel = None
     load_tokenizer = None
 
-from utils import s2tw_converter, load_sensevoice_model
-import re
-from typing import Optional, Tuple, List
-import tempfile
-import os
+from .utils import load_sensevoice_model, s2tw_converter
 
 SAMPLING_RATE = 16000
 CHUNK_SIZE = 512
@@ -44,8 +41,8 @@ def transcribe_file(
     model_name: str,
     backend: str = "moonshine",
     language: str = "auto",
-    textnorm: str = "withitn"
-) -> Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]:
+    textnorm: str = "withitn",
+) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]]:
     """
     Transcribe audio file using specified backend.
 
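The signature change is the substantive part of this diff: transcribe_file now returns an Iterable of (latest utterance, utterances so far) tuples instead of a single tuple, which suggests it became a generator yielding partial results as segments are decoded, the shape a streaming /api/transcribe endpoint needs. A hedged consumer sketch; the positional audio argument and the model name fall outside the visible hunks and are assumptions:

# Hypothetical consumer of the generator-style transcribe_file.
from src.asr import transcribe_file

for latest, utterances_so_far in transcribe_file(
    "episode.wav",              # assumed positional audio argument
    model_name="sensevoice",    # placeholder model name
    backend="sensevoice",
    language="auto",
    textnorm="withitn",
):
    if latest is not None:
        start, end, text = latest
        print(f"[{start:6.1f}-{end:6.1f}] {text}")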
src/diarization.py CHANGED
@@ -17,12 +17,11 @@ import numpy as np
17
  import sherpa_onnx
18
  from pathlib import Path
19
  from typing import List, Tuple, Optional, Callable, Dict, Any
20
- import streamlit as st
21
  import logging
22
- from utils import get_writable_model_dir
23
- from utils import num_vcpus
24
  from huggingface_hub import hf_hub_download
25
  import shutil
 
26
 
27
  # Import the improved diarization pipeline (robust: search repo tree)
28
  try:
@@ -95,11 +94,11 @@ def download_diarization_models():
95
  repo_id = "csukuangfj/speaker-embedding-models"
96
  filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
97
  embedding_model = models_dir / filename
98
- st.info(f"Model cache directory: {models_dir}")
99
  try:
100
  # Download using huggingface_hub if not present
101
  if not embedding_model.exists():
102
- st.info("📥 Downloading eres2netv2 Chinese speaker model from HuggingFace (29MB)...")
103
  downloaded_path = hf_hub_download(
104
  repo_id=repo_id,
105
  filename=filename,
@@ -111,10 +110,10 @@ def download_diarization_models():
111
  # Move/copy to expected location if needed
112
  if Path(downloaded_path) != embedding_model:
113
  shutil.copy(downloaded_path, embedding_model)
114
- st.success("✅ eres2netv2 Chinese embedding model downloaded!")
115
  return str(embedding_model), True
116
  except Exception as e:
117
- st.error(f"❌ Failed to download diarization models: {e}")
118
  return None, False
119
 
120
  def init_speaker_embedding_extractor(
@@ -137,26 +136,26 @@ def init_speaker_embedding_extractor(
137
  embedding_model, success = download_diarization_models()
138
  if not success:
139
  return None
140
-
141
  # Create embedding extractor config
142
  embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
143
  model=embedding_model,
144
  num_threads=num_vcpus
145
  )
146
-
147
  # Initialize embedding extractor
148
  embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
149
-
150
  # Store clustering parameters separately
151
  config_dict = {
152
  'cluster_threshold': cluster_threshold,
153
  'num_speakers': num_speakers
154
  }
155
-
156
  return embedding_extractor, config_dict
157
-
158
  except Exception as e:
159
- st.error(f"❌ Failed to initialize speaker embedding extractor: {e}")
160
  return None
161
 
162
  def perform_speaker_diarization_on_utterances(
@@ -195,19 +194,13 @@ def perform_speaker_diarization_on_utterances(
195
  # Check sample rate
196
  if sample_rate != 16000:
197
  warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
198
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
199
- st.warning(warning_msg)
200
- print(warning_msg)
201
 
202
  if not utterances:
203
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
204
- st.warning("⚠️ No utterances provided for diarization")
205
- print("⚠️ No utterances provided for diarization")
206
  return []
207
 
208
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
209
- st.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
210
- print(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
211
 
212
  # Extract embeddings for each utterance segment
213
  embeddings = []
@@ -258,12 +251,12 @@ def perform_speaker_diarization_on_utterances(
258
  continue
259
 
260
  if not embeddings:
261
- st.error("❌ No valid embeddings extracted")
262
  print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
263
  return []
264
 
265
  print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
266
- st.info(f"✅ Extracted {len(embeddings)} embeddings, performing clustering...")
267
 
268
  # Convert embeddings to numpy array
269
  embeddings_array = np.array(embeddings)
@@ -272,7 +265,7 @@ def perform_speaker_diarization_on_utterances(
272
  # Use enhanced diarization if available
273
  if ENHANCED_DIARIZATION_AVAILABLE:
274
  print("🚀 Using enhanced diarization with adaptive clustering...")
275
- st.info("🚀 Using enhanced adaptive clustering...")
276
 
277
  # Prepare utterances dict format for enhanced pipeline
278
  utterances_dict = []
@@ -300,11 +293,11 @@ def perform_speaker_diarization_on_utterances(
300
 
301
  quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
302
  if quality in ['excellent', 'good']:
303
- st.success(quality_msg)
304
  elif quality == 'fair':
305
- st.warning(quality_msg)
306
  else:
307
- st.error(quality_msg)
308
 
309
  print(f"✅ Enhanced diarization quality report:")
310
  print(f" - Quality: {quality}")
@@ -314,7 +307,7 @@ def perform_speaker_diarization_on_utterances(
314
  print(f" - Speakers detected: {n_speakers}")
315
 
316
  if quality_report['recommendations']:
317
- st.info("💡 " + "; ".join(quality_report['recommendations']))
318
 
319
  # Convert back to tuple format
320
  diarization_result = []
@@ -325,17 +318,17 @@ def perform_speaker_diarization_on_utterances(
325
  progress_callback(1.0) # 100% complete
326
 
327
  print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
328
- st.success(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
329
 
330
  return diarization_result
331
 
332
  except Exception as e:
333
- st.error(f"❌ Enhanced diarization failed: {e}")
334
  print(f"❌ Enhanced diarization failed: {e}")
335
  # Fall back to original clustering
336
 
337
  # Fallback to original clustering
338
- st.warning("⚠️ Using fallback clustering")
339
  print("⚠️ Using fallback clustering")
340
 
341
  # >>> NEW: FAISS clustering when available, otherwise the legacy code path
@@ -349,8 +342,6 @@ def perform_speaker_diarization_on_utterances(
349
  print(error_msg)
350
  import traceback
351
  traceback.print_exc()
352
- if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
353
- st.error(error_msg)
354
  return []
355
 
356
  def merge_transcription_with_diarization(
@@ -555,7 +546,7 @@ def faiss_clustering(embeddings: np.ndarray,
555
 
556
  num_speakers = len(set(labels))
557
  print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
558
- st.success(f"🎭 FAISS clustering completed! Detected {num_speakers} speakers")
559
 
560
  return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
561
 
 
17
  import sherpa_onnx
18
  from pathlib import Path
19
  from typing import List, Tuple, Optional, Callable, Dict, Any
 
20
  import logging
21
+ from .utils import get_writable_model_dir, num_vcpus
 
22
  from huggingface_hub import hf_hub_download
23
  import shutil
24
+ from sklearn.metrics import silhouette_score
25
 
26
  # Import the improved diarization pipeline (robust: search repo tree)
27
  try:
 
94
  repo_id = "csukuangfj/speaker-embedding-models"
95
  filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
96
  embedding_model = models_dir / filename
97
+ logger.info(f"Model cache directory: {models_dir}")
98
  try:
99
  # Download using huggingface_hub if not present
100
  if not embedding_model.exists():
101
+ logger.info("📥 Downloading eres2netv2 Chinese speaker model from HuggingFace (29MB)...")
102
  downloaded_path = hf_hub_download(
103
  repo_id=repo_id,
104
  filename=filename,
 
110
  # Move/copy to expected location if needed
111
  if Path(downloaded_path) != embedding_model:
112
  shutil.copy(downloaded_path, embedding_model)
113
+ logger.info("✅ eres2netv2 Chinese embedding model downloaded!")
114
  return str(embedding_model), True
115
  except Exception as e:
116
+ logger.error(f"❌ Failed to download diarization models: {e}")
117
  return None, False
118
 
119
  def init_speaker_embedding_extractor(
 
136
  embedding_model, success = download_diarization_models()
137
  if not success:
138
  return None
139
+
140
  # Create embedding extractor config
141
  embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
142
  model=embedding_model,
143
  num_threads=num_vcpus
144
  )
145
+
146
  # Initialize embedding extractor
147
  embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
148
+
149
  # Store clustering parameters separately
150
  config_dict = {
151
  'cluster_threshold': cluster_threshold,
152
  'num_speakers': num_speakers
153
  }
154
+
155
  return embedding_extractor, config_dict
156
+
157
  except Exception as e:
158
+ logger.error(f"❌ Failed to initialize speaker embedding extractor: {e}")
159
  return None
160
 
161
  def perform_speaker_diarization_on_utterances(
 
194
  # Check sample rate
195
  if sample_rate != 16000:
196
  warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
197
+ logger.warning(warning_msg)
 
 
198
 
199
  if not utterances:
200
+ logger.warning("⚠️ No utterances provided for diarization")
 
 
201
  return []
202
 
203
+ logger.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
 
 
204
 
205
  # Extract embeddings for each utterance segment
206
  embeddings = []
 
251
  continue
252
 
253
  if not embeddings:
254
+ logger.error("❌ No valid embeddings extracted")
255
  print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
256
  return []
257
 
258
  print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
259
+ logger.info(f"✅ Extracted {len(embeddings)} embeddings, performing clustering...")
260
 
261
  # Convert embeddings to numpy array
262
  embeddings_array = np.array(embeddings)
 
265
  # Use enhanced diarization if available
266
  if ENHANCED_DIARIZATION_AVAILABLE:
267
  print("🚀 Using enhanced diarization with adaptive clustering...")
268
+ logger.info("🚀 Using enhanced adaptive clustering...")
269
 
270
  # Prepare utterances dict format for enhanced pipeline
271
  utterances_dict = []
 
293
 
294
  quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
295
  if quality in ['excellent', 'good']:
296
+ logger.info(quality_msg)
297
  elif quality == 'fair':
298
+ logger.warning(quality_msg)
299
  else:
300
+ logger.error(quality_msg)
301
 
302
  print(f"✅ Enhanced diarization quality report:")
303
  print(f" - Quality: {quality}")
 
307
  print(f" - Speakers detected: {n_speakers}")
308
 
309
  if quality_report['recommendations']:
310
+ logger.info("💡 " + "; ".join(quality_report['recommendations']))
311
 
312
  # Convert back to tuple format
313
  diarization_result = []
 
318
  progress_callback(1.0) # 100% complete
319
 
320
  print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
321
+ logger.info(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
322
 
323
  return diarization_result
324
 
325
  except Exception as e:
326
+ logger.error(f"❌ Enhanced diarization failed: {e}")
327
  print(f"❌ Enhanced diarization failed: {e}")
328
  # Fall back to original clustering
329
 
330
  # Fallback to original clustering
331
+ logger.warning("⚠️ Using fallback clustering")
332
  print("⚠️ Using fallback clustering")
333
 
334
  # >>> NEW: FAISS clustering when available, otherwise the legacy code path
 
342
  print(error_msg)
343
  import traceback
344
  traceback.print_exc()
 
 
345
  return []
346
 
347
  def merge_transcription_with_diarization(
 
546
 
547
  num_speakers = len(set(labels))
548
  print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
549
+ logger.info(f"🎭 FAISS clustering completed! Detected {num_speakers} speakers")
550
 
551
  return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
552
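Note on the hunks above: they log through a module-level `logger` whose creation sits outside the visible context. A minimal sketch of the setup this code assumes (the handler configuration is illustrative, not part of the commit):

import logging

# Module-level logger assumed by the diarization hunks above.
logger = logging.getLogger(__name__)

# Illustrative default so messages show up when the module runs standalone;
# the real configuration may live in the FastAPI entry point.
if not logging.getLogger().handlers:
    logging.basicConfig(level=logging.INFO)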
 
src/server/__init__.py ADDED
File without changes
src/server/core/config.py ADDED
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from functools import lru_cache
6
+
7
+
8
+ BASE_DIR = Path(__file__).resolve().parents[3]
9
+ STATIC_DIR = BASE_DIR / "static"
10
+ AUDIO_DIR = STATIC_DIR / "audio"
11
+ MODEL_CACHE_DIR = BASE_DIR / "tmp" / "models"
12
+ FRONTEND_DIR = BASE_DIR / "frontend"
13
+ TMP_DIR = BASE_DIR / "tmp"
14
+
15
+ # Ensure required directories exist
16
+ for directory in (STATIC_DIR, AUDIO_DIR, MODEL_CACHE_DIR, TMP_DIR, FRONTEND_DIR):
17
+ directory.mkdir(parents=True, exist_ok=True)
18
+
19
+
20
+ class Settings:
21
+ app_name: str = "VoxSum Studio API"
22
+ static_dir: Path = STATIC_DIR
23
+ audio_dir: Path = AUDIO_DIR
24
+ frontend_dir: Path = FRONTEND_DIR
25
+ tmp_dir: Path = TMP_DIR
26
+ model_cache_dir: Path = MODEL_CACHE_DIR
27
+ max_audio_files: int = int(os.environ.get("VOXSUM_MAX_AUDIO_FILES", "20"))
28
+ transcription_chunk_size: int = 100
29
+
30
+
31
+ @lru_cache(maxsize=1)
32
+ def get_settings() -> Settings:
33
+ return Settings()
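Usage sketch for the settings module (values illustrative): `get_settings()` is memoized, so every caller shares one `Settings` instance, and `max_audio_files` reads its environment variable when the class body executes, i.e. at import time, so `VOXSUM_MAX_AUDIO_FILES` must be set before the module is first imported.

import os

# Must happen before the first import of src.server.core.config.
os.environ["VOXSUM_MAX_AUDIO_FILES"] = "50"

from src.server.core.config import get_settings

settings = get_settings()           # lru_cache: one shared Settings instance
assert settings.max_audio_files == 50
print(settings.audio_dir)           # <repo root>/static/audio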
src/server/main.py ADDED
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from fastapi.staticfiles import StaticFiles
6
+
7
+ from .core.config import get_settings
8
+ from .routers.api import router as api_router
9
+
10
+ settings = get_settings()
11
+
12
+ app = FastAPI(title=settings.app_name)
13
+
14
+ app.add_middleware(
15
+ CORSMiddleware,
16
+ allow_origins=["*"],
17
+ allow_credentials=True,
18
+ allow_methods=["*"] ,
19
+ allow_headers=["*"],
20
+ )
21
+
22
+ app.include_router(api_router)
23
+
24
+ app.mount("/static", StaticFiles(directory=settings.static_dir), name="static")
25
+ app.mount("/media", StaticFiles(directory=settings.audio_dir), name="media")
26
+ app.mount("/", StaticFiles(directory=settings.frontend_dir, html=True), name="frontend")
27
+
28
+
29
+ @app.get("/health")
30
+ def healthcheck() -> dict[str, str]:
31
+ return {"status": "ok"}
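A quick smoke test of the wiring above, assuming `httpx` is installed (Starlette's TestClient depends on it) and that frontend/index.html exists; a sketch, not part of the commit:

from fastapi.testclient import TestClient

from src.server.main import app

client = TestClient(app)

# Reachable because /health is registered before the root mount.
assert client.get("/health").json() == {"status": "ok"}

# html=True makes the frontend mount serve frontend/index.html at "/".
assert client.get("/").status_code == 200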
src/server/routers/api.py ADDED
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from fastapi import APIRouter, File, Form, HTTPException, UploadFile
7
+ from fastapi.responses import StreamingResponse
8
+
9
+ from ..models.export import SummaryExportRequest, TranscriptExportRequest
10
+ from ..models.summarization import SummaryRequest
11
+ from ..models.transcription import TranscriptionRequest
12
+ from ..core.config import get_settings
13
+ from ..services import config_service, export_service, podcast_service
14
+ from ..services.asr_service import iter_transcription_events
15
+ from ..services.file_service import save_upload_file, store_audio_file
16
+ from ..services.summarization_service import iter_summary_events
17
+
18
+ router = APIRouter(prefix="/api")
19
+
20
+
21
+ @router.get("/config/models")
22
+ def fetch_model_catalog():
23
+ return config_service.get_model_catalog()
24
+
25
+
26
+ @router.post("/transcribe")
27
+ def transcribe_audio(
28
+ audio: UploadFile | None = File(default=None),
29
+ options: str = Form("{}"),
30
+ source: str | None = Form(default=None),
31
+ ):
32
+ payload = TranscriptionRequest(**json.loads(options or "{}"))
33
+
34
+ cleanup_temp = False
35
+ if audio is not None:
36
+ temp_path = save_upload_file(audio)
37
+ _, audio_url = store_audio_file(temp_path)
38
+ cleanup_temp = True
39
+ elif source:
40
+ filename = Path(source).name
41
+ candidate_path = get_settings().audio_dir / filename
42
+ if not candidate_path.exists():
43
+ raise HTTPException(status_code=404, detail="Audio source not found")
44
+ temp_path = candidate_path
45
+ audio_url = source
46
+ else:
47
+ raise HTTPException(status_code=400, detail="Either audio upload or source is required")
48
+
49
+ def event_stream():
50
+ try:
51
+ for event in iter_transcription_events(temp_path, audio_url, payload):
52
+ yield json.dumps(event, ensure_ascii=False) + "\n"
53
+ finally:
54
+ if cleanup_temp:
55
+ temp_path.unlink(missing_ok=True)
56
+
57
+ return StreamingResponse(event_stream(), media_type="application/x-ndjson")
58
+
59
+
60
+ @router.post("/summarize")
61
+ def summarize_text(request: SummaryRequest):
62
+ def event_stream():
63
+ for event in iter_summary_events(request):
64
+ yield json.dumps(event, ensure_ascii=False) + "\n"
65
+
66
+ return StreamingResponse(event_stream(), media_type="application/x-ndjson")
67
+
68
+
69
+ @router.get("/podcast/search")
70
+ def search_podcast(query: str):
71
+ return podcast_service.search_series(query)
72
+
73
+
74
+ @router.get("/podcast/episodes")
75
+ def get_podcast_episodes(feed_url: str):
76
+ return podcast_service.list_episodes(feed_url)
77
+
78
+
79
+ @router.post("/podcast/download")
80
+ def download_episode(payload: dict):
81
+ audio_url = payload.get("audioUrl") or payload.get("audio_url")
82
+ title = payload.get("title", "Episode")
83
+ if not audio_url:
84
+ raise HTTPException(status_code=400, detail="audioUrl is required")
85
+ return podcast_service.download_episode(audio_url, title)
86
+
87
+
88
+ @router.post("/youtube/fetch")
89
+ def fetch_youtube_audio(payload: dict):
90
+ url = payload.get("url") or payload.get("youtubeUrl")
91
+ if not url:
92
+ raise HTTPException(status_code=400, detail="url is required")
93
+ return podcast_service.fetch_youtube_audio(url)
94
+
95
+
96
+ @router.post("/export/transcript")
97
+ def export_transcript(payload: TranscriptExportRequest):
98
+ content, filename, mime_type = export_service.generate_transcript_export(payload)
99
+ return StreamingResponse(
100
+ iter([content.encode("utf-8")]),
101
+ media_type=mime_type,
102
+ headers={"Content-Disposition": f"attachment; filename={filename}"},
103
+ )
104
+
105
+
106
+ @router.post("/export/summary")
107
+ def export_summary(payload: SummaryExportRequest):
108
+ content, filename, mime_type = export_service.generate_summary_export(payload)
109
+ return StreamingResponse(
110
+ iter([content.encode("utf-8")]),
111
+ media_type=mime_type,
112
+ headers={"Content-Disposition": f"attachment; filename={filename}"},
113
+ )
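For reference, a hedged client sketch for the streaming transcription route; the base URL and option values are assumptions, while the event shapes follow `iter_transcription_events`. One caveat of NDJSON streaming: once the first line is sent, an HTTPException raised inside the generator can no longer change the status code the client already received.

import json
import requests

BASE = "http://localhost:7860"  # assumed deployment URL

# Option keys mirror the TranscriptionRequest fields read by the ASR service.
options = {"backend": "sensevoice", "language": "auto", "vad_threshold": 0.5}

with open("sample.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/transcribe",
        files={"audio": ("sample.wav", f, "audio/wav")},
        data={"options": json.dumps(options)},
        stream=True,
    )
resp.raise_for_status()

for line in resp.iter_lines():
    if not line:
        continue
    event = json.loads(line)
    if event["type"] == "utterance":
        u = event["utterance"]
        print(f"[{u['start']:.2f}-{u['end']:.2f}] {u['text']}")
    elif event["type"] == "complete":
        print("transcript length:", len(event["transcript"]))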
src/server/services/asr_service.py ADDED
@@ -0,0 +1,149 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Iterable, List, Optional, Tuple
5
+
6
+ import soundfile as sf
7
+ from fastapi import HTTPException
8
+
9
+ from src.asr import transcribe_file
10
+ from src.diarization import (
11
+ get_diarization_stats,
12
+ init_speaker_embedding_extractor,
13
+ merge_consecutive_utterances,
14
+ merge_transcription_with_diarization,
15
+ perform_speaker_diarization_on_utterances,
16
+ )
17
+ from src.utils import sensevoice_models
18
+
19
+ from ..core.config import get_settings
20
+ from ..models.transcription import DiarizationOptions, TranscriptionRequest
21
+
22
+ settings = get_settings()
23
+
24
+
25
+ def _serialize_utterance(utt: Tuple[float, float, str], speaker: Optional[int] = None) -> Dict[str, object]:
26
+ start, end, text = utt
27
+ payload: Dict[str, object] = {
28
+ "start": round(float(start), 3),
29
+ "end": round(float(end), 3),
30
+ "text": text,
31
+ }
32
+ if speaker is not None:
33
+ payload["speaker"] = int(speaker)
34
+ return payload
35
+
36
+
37
+ def _prepare_model_name(options: TranscriptionRequest) -> str:
38
+ if options.backend == "sensevoice":
39
+ # sensevoice_models maps friendly model names to repo ids
40
+ return sensevoice_models.get(options.model_name, options.model_name)
41
+ return options.model_name
42
+
43
+
44
+ def iter_transcription_events(
45
+ audio_path: Path,
46
+ audio_url: str,
47
+ options: TranscriptionRequest,
48
+ ) -> Iterable[Dict[str, object]]:
49
+ model_name = _prepare_model_name(options)
50
+
51
+ try:
52
+ generator = transcribe_file(
53
+ audio_path=str(audio_path),
54
+ vad_threshold=options.vad_threshold,
55
+ model_name=model_name,
56
+ backend=options.backend,
57
+ language=options.language,
58
+ textnorm=options.textnorm,
59
+ )
60
+
61
+ yield {
62
+ "type": "ready",
63
+ "audioUrl": audio_url,
64
+ "backend": options.backend,
65
+ "model": model_name,
66
+ }
67
+
68
+ final_utterances: List[Tuple[float, float, str]] = []
69
+
70
+ for current_utterance, all_utterances in generator:
71
+ if current_utterance:
72
+ start, end, text = current_utterance
73
+ yield {
74
+ "type": "utterance",
75
+ "utterance": _serialize_utterance((start, end, text)),
76
+ "index": len(all_utterances) - 1,
77
+ }
78
+ final_utterances = list(all_utterances)
79
+
80
+ # Final event with transcript and optional diarization
81
+ diarization_payload = None
82
+ if options.diarization.enable:
83
+ diarization_payload = _run_diarization(audio_path, final_utterances, options.diarization)
84
+
85
+ transcript_text = "\n".join([utt[2] for utt in final_utterances])
86
+
87
+ yield {
88
+ "type": "complete",
89
+ "utterances": [_serialize_utterance(utt) for utt in final_utterances],
90
+ "transcript": transcript_text,
91
+ "diarization": diarization_payload,
92
+ }
93
+
94
+ except Exception as exc: # pragma: no cover
95
+ raise HTTPException(status_code=500, detail=f"Transcription failed: {exc}")
96
+
97
+
98
+ def _run_diarization(
99
+ audio_path: Path,
100
+ utterances: List[Tuple[float, float, str]],
101
+ options: DiarizationOptions,
102
+ ) -> Optional[Dict[str, object]]:
103
+ if not utterances:
104
+ return None
105
+
106
+ extractor_result = init_speaker_embedding_extractor(
107
+ cluster_threshold=options.cluster_threshold,
108
+ num_speakers=options.num_speakers,
109
+ )
110
+ if not extractor_result:
111
+ return None
112
+
113
+ embedding_extractor, config_dict = extractor_result
114
+
115
+ audio, sample_rate = sf.read(str(audio_path), dtype="float32")
116
+ if audio.ndim > 1:
117
+ audio = audio.mean(axis=1)
118
+
119
+ if sample_rate != 16000:
120
+ # Lazy import so scipy is only required when resampling is needed
121
+ from scipy.signal import resample
122
+
123
+ target_num_samples = int(len(audio) * 16000 / sample_rate)
124
+ audio = resample(audio, target_num_samples)
125
+ sample_rate = 16000
126
+
127
+ diarization_segments = perform_speaker_diarization_on_utterances(
128
+ audio=audio,
129
+ sample_rate=sample_rate,
130
+ utterances=utterances,
131
+ embedding_extractor=embedding_extractor,
132
+ config_dict=config_dict,
133
+ progress_callback=None,
134
+ )
135
+
136
+ if not diarization_segments:
137
+ return None
138
+
139
+ merged = merge_transcription_with_diarization(utterances, diarization_segments)
140
+ merged = merge_consecutive_utterances(merged, max_gap=1.0)
141
+ stats = get_diarization_stats(merged)
142
+
143
+ return {
144
+ "utterances": [
145
+ _serialize_utterance((start, end, text), speaker)
146
+ for start, end, text, speaker in merged
147
+ ],
148
+ "stats": stats,
149
+ }
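One design note on the resampling step above: `scipy.signal.resample` is FFT-based and processes the whole signal at once, which gets expensive for hour-long recordings. A polyphase alternative for the same 16 kHz target is sketched below; it is an option, not what the commit ships.

from scipy.signal import resample_poly

def to_16k(audio, sample_rate):
    """Polyphase resampling to 16 kHz (sketch).

    resample_poly reduces up/down by their gcd internally, so the raw
    rates can be passed directly.
    """
    if sample_rate == 16000:
        return audio
    return resample_poly(audio, up=16000, down=sample_rate)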
src/server/services/config_service.py ADDED
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from src.utils import available_gguf_llms, model_names, sensevoice_models
6
+
7
+
8
+ def get_model_catalog() -> Dict[str, object]:
9
+ return {
10
+ "moonshine": model_names,
11
+ "sensevoice": sensevoice_models,
12
+ "llms": {name: {"repo": repo, "filename": filename} for name, (repo, filename) in available_gguf_llms.items()},
13
+ }
src/server/services/export_service.py ADDED
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Tuple
5
+
6
+ from src.export_utils import (
7
+ SUBTITLE_FORMATS,
8
+ SUMMARY_FORMATS,
9
+ TRANSCRIPT_FORMATS,
10
+ export_plain_text,
11
+ )
12
+
13
+ from ..models.export import SummaryExportRequest, TranscriptExportRequest
14
+
15
+
16
+ def _build_utterance_tuples(payload: TranscriptExportRequest):
17
+ utterances = [(u.start, u.end, u.text) for u in payload.utterances]
18
+ has_speakers = any(u.speaker is not None for u in payload.utterances)
19
+ utterances_with_speakers = None
20
+ if has_speakers:
21
+ utterances_with_speakers = [
22
+ (u.start, u.end, u.text, u.speaker if u.speaker is not None else 0)
23
+ for u in payload.utterances
24
+ ]
25
+ return utterances, utterances_with_speakers
26
+
27
+
28
+ def generate_transcript_export(payload: TranscriptExportRequest) -> Tuple[str, str, str]:
29
+ utterances, utterances_with_speakers = _build_utterance_tuples(payload)
30
+
31
+ if payload.format in SUBTITLE_FORMATS:
32
+ fmt = SUBTITLE_FORMATS[payload.format]
33
+ content = fmt["function"](utterances, utterances_with_speakers)
34
+ elif payload.format in TRANSCRIPT_FORMATS:
35
+ fmt = TRANSCRIPT_FORMATS[payload.format]
36
+ if payload.format == "Plain Text":
37
+ content = export_plain_text(
38
+ utterances,
39
+ utterances_with_speakers,
40
+ include_timestamps=payload.include_timestamps,
41
+ )
42
+ else:
43
+ content = fmt["function"](utterances, utterances_with_speakers)
44
+ else:
45
+ raise ValueError(f"Unsupported transcript export format: {payload.format}")
46
+
47
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
48
+ filename = f"transcript_{timestamp}{fmt['extension']}"
49
+ return content, filename, fmt["mime_type"]
50
+
51
+
52
+ def generate_summary_export(payload: SummaryExportRequest) -> Tuple[str, str, str]:
53
+ if payload.format not in SUMMARY_FORMATS:
54
+ raise ValueError(f"Unsupported summary export format: {payload.format}")
55
+
56
+ fmt = SUMMARY_FORMATS[payload.format]
57
+ content = fmt["function"](payload.summary, payload.metadata)
58
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
59
+ filename = f"summary_{timestamp}{fmt['extension']}"
60
+ return content, filename, fmt["mime_type"]
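A hedged sketch of calling the transcript export route from Python. The payload fields mirror what `generate_transcript_export` reads (`format`, `include_timestamps`, and `utterances` with start/end/text/speaker); "Plain Text" is the only format key visible in this diff, so other format names would be assumptions.

import requests

payload = {
    "format": "Plain Text",
    "include_timestamps": True,
    "utterances": [
        {"start": 0.0, "end": 2.5, "text": "Hello there.", "speaker": 0},
        {"start": 2.5, "end": 5.0, "text": "Hi!", "speaker": 1},
    ],
}

resp = requests.post("http://localhost:7860/api/export/transcript", json=payload)  # base URL assumed
resp.raise_for_status()
with open("transcript.txt", "wb") as f:
    f.write(resp.content)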
src/server/services/file_service.py ADDED
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import uuid
5
+ from pathlib import Path
6
+ from typing import Tuple
7
+
8
+ from fastapi import UploadFile
9
+
10
+ from ..core.config import get_settings
11
+
12
+ settings = get_settings()
13
+
14
+
15
+ def cleanup_old_audio_files(max_files: int | None = None) -> None:
16
+ """Remove old audio files from the static directory to save space."""
17
+ max_files = max_files or settings.max_audio_files
18
+ audio_dir = settings.audio_dir
19
+ audio_dir.mkdir(parents=True, exist_ok=True)
20
+
21
+ files = sorted(audio_dir.glob("*") , key=lambda f: f.stat().st_mtime if f.exists() else 0)
22
+ if len(files) <= max_files:
23
+ return
24
+
25
+ for old_file in files[:-max_files]:
26
+ try:
27
+ old_file.unlink()
28
+ except OSError:
29
+ continue
30
+
31
+
32
+ def save_upload_file(upload: UploadFile) -> Path:
33
+ """Persist an UploadFile to the temporary directory and return its path."""
34
+ tmp_dir = settings.tmp_dir
35
+ tmp_dir.mkdir(parents=True, exist_ok=True)
36
+ suffix = Path(upload.filename or "audio").suffix or ".mp3"
37
+ temp_path = tmp_dir / f"upload_{uuid.uuid4().hex}{suffix}"
38
+
39
+ with temp_path.open("wb") as buffer:
40
+ shutil.copyfileobj(upload.file, buffer)
41
+
42
+ return temp_path
43
+
44
+
45
+ def store_audio_file(audio_path: Path, prefix: str | None = None) -> Tuple[Path, str]:
46
+ """Copy an audio file to the public static folder and return the new path and URL."""
47
+ cleanup_old_audio_files()
48
+
49
+ prefix = prefix or "audio"
50
+ suffix = audio_path.suffix or ".mp3"
51
+ dest_filename = f"{prefix}_{uuid.uuid4().hex}{suffix}"
52
+ dest_path = settings.audio_dir / dest_filename
53
+
54
+ shutil.copy2(audio_path, dest_path)
55
+
56
+ url = f"/media/{dest_filename}"
57
+ return dest_path, url
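Taken together, these helpers implement the upload flow used by the transcribe route; a minimal usage sketch (paths relative to the repo root):

from pathlib import Path

from src.server.services.file_service import store_audio_file

# Copies a local file into the public audio folder; files beyond the
# configured cap are pruned first by cleanup_old_audio_files().
dest_path, url = store_audio_file(Path("sample.mp3"), prefix="demo")
print(dest_path)  # static/audio/demo_<uuid>.mp3
print(url)        # /media/demo_<uuid>.mp3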
src/server/services/podcast_service.py ADDED
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional
5
+
6
+ from fastapi import HTTPException
7
+
8
+ from src.podcast import (
9
+ download_podcast_audio,
10
+ fetch_audio,
11
+ fetch_episodes,
12
+ search_podcast_series,
13
+ )
14
+
15
+ from .file_service import store_audio_file
16
+
17
+
18
+ def search_series(query: str) -> List[Dict[str, object]]:
19
+ return search_podcast_series(query)
20
+
21
+
22
+ def list_episodes(feed_url: str) -> List[Dict[str, object]]:
23
+ return fetch_episodes(feed_url)
24
+
25
+
26
+ def download_episode(audio_url: str, title: str) -> Dict[str, str]:
27
+ file_path, status = download_podcast_audio(audio_url, title, status="Podcast download")
28
+ if not file_path:
29
+ raise HTTPException(status_code=500, detail=status or "Download failed")
30
+
31
+ _, audio_url = store_audio_file(Path(file_path), prefix="podcast")
32
+ return {"audioUrl": audio_url, "status": status}
33
+
34
+
35
+ def fetch_youtube_audio(youtube_url: str) -> Dict[str, str]:
36
+ audio_path, status = fetch_audio(youtube_url, status="YouTube fetch")
37
+ if not audio_path:
38
+ raise HTTPException(status_code=500, detail=status or "YouTube download failed")
39
+
40
+ _, audio_url = store_audio_file(Path(audio_path), prefix="youtube")
41
+ return {"audioUrl": audio_url, "status": status}
src/server/services/summarization_service.py ADDED
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, Iterable
4
+
5
+ from fastapi import HTTPException
6
+
7
+ from src.summarization import summarize_transcript
8
+
9
+ from ..models.summarization import SummaryRequest
10
+
11
+
12
+ def iter_summary_events(payload: SummaryRequest) -> Iterable[Dict[str, str]]:
13
+ try:
14
+ generator = summarize_transcript(
15
+ transcript=payload.transcript,
16
+ selected_gguf_model=payload.llm_model,
17
+ prompt_input=payload.prompt,
18
+ )
19
+
20
+ for chunk in generator:
21
+ yield {"type": "partial", "content": chunk}
22
+
23
+ yield {"type": "complete"}
24
+
25
+ except Exception as exc: # pragma: no cover
26
+ raise HTTPException(status_code=500, detail=f"Summary failed: {exc}")
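The summary route streams the same NDJSON convention as transcription; a client sketch, with field names taken from the SummaryRequest usage above (transcript, llm_model, prompt) and the model name assumed:

import json
import requests

payload = {
    "transcript": "...full transcript text...",
    "llm_model": "some-gguf-model",  # must be a key in available_gguf_llms; name assumed
    "prompt": "Summarize the key points.",
}

with requests.post("http://localhost:7860/api/summarize", json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if event["type"] == "partial":
            print(event["content"], end="", flush=True)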
src/summarization.py CHANGED
@@ -1,10 +1,10 @@
1
  # summarization.py
2
- from llama_cpp import Llama
3
- from utils import available_gguf_llms, s2tw_converter
4
  import time
5
  from functools import lru_cache
6
- import multiprocessing
7
- from utils import num_vcpus
 
 
8
  # Detect logical cores (vCPUs available to the container)
9
  print(f"Detected vCPUs: {num_vcpus}")
10
 
 
1
  # summarization.py
 
 
2
  import time
3
  from functools import lru_cache
4
+
5
+ from llama_cpp import Llama
6
+
7
+ from .utils import available_gguf_llms, num_vcpus, s2tw_converter
8
  # Detect logical cores (vCPUs available to the container)
9
  print(f"Detected vCPUs: {num_vcpus}")
10
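Note on the import change: switching to `from .utils import ...` makes summarization.py usable only as part of its package; Python resolves relative imports against a parent package, so the module now has to be imported rather than executed as a loose script.

# Works when the repo root is on sys.path:
from src import summarization

# Fails with "ImportError: attempted relative import with no known parent
# package" when run directly:  python src/summarization.py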