re-implement in FastAPI
Browse files- Dockerfile +12 -32
- frontend/app.js +768 -0
- frontend/index.html +198 -0
- frontend/styles.css +448 -0
- requirements.txt +6 -4
- src/asr.py +8 -11
- src/diarization.py +26 -35
- src/server/__init__.py +0 -0
- src/server/core/config.py +33 -0
- src/server/main.py +31 -0
- src/server/routers/api.py +113 -0
- src/server/services/asr_service.py +149 -0
- src/server/services/config_service.py +13 -0
- src/server/services/export_service.py +60 -0
- src/server/services/file_service.py +57 -0
- src/server/services/podcast_service.py +41 -0
- src/server/services/summarization_service.py +26 -0
- src/summarization.py +4 -4
Dockerfile
CHANGED
|
@@ -13,49 +13,29 @@ RUN apt-get update && apt-get install -y \
|
|
| 13 |
libopenblas-dev \
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
| 16 |
-
# === CRITICAL FIX + PERFORMANCE OPTIMIZATIONS ===
|
| 17 |
-
# Set Streamlit to use temporary directories for ALL storage
|
| 18 |
ENV HOME=/tmp
|
| 19 |
-
ENV STREAMLIT_GLOBAL_DEVELOPMENT_MODE=false
|
| 20 |
-
ENV STREAMLIT_GLOBAL_DATA_PATH=/tmp
|
| 21 |
-
ENV STREAMLIT_CONFIG_DIR=/tmp/.streamlit
|
| 22 |
ENV HF_HOME=/tmp/huggingface
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
# Create directories
|
| 25 |
-
RUN mkdir -p /tmp
|
| 26 |
-
chmod -R 777 /tmp /app/static
|
| 27 |
-
|
| 28 |
-
# Create config file with proper settings for large file handling
|
| 29 |
-
RUN mkdir -p /tmp/.streamlit && \
|
| 30 |
-
cat <<EOF > /tmp/.streamlit/config.toml
|
| 31 |
-
[browser]
|
| 32 |
-
gatherUsageStats = false
|
| 33 |
-
|
| 34 |
-
[server]
|
| 35 |
-
enableCORS = false
|
| 36 |
-
enableXsrfProtection = false
|
| 37 |
-
maxUploadSize = 500
|
| 38 |
-
maxMessageSize = 500
|
| 39 |
-
|
| 40 |
-
[runner]
|
| 41 |
-
maxCachedEntries = 1000
|
| 42 |
-
fastReruns = true
|
| 43 |
-
EOF
|
| 44 |
|
| 45 |
# Copy files
|
| 46 |
COPY requirements.txt ./
|
| 47 |
COPY src/ ./src/
|
| 48 |
COPY static/ ./static/
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Install Python dependencies
|
| 51 |
RUN pip3 install --no-cache-dir -r requirements.txt
|
| 52 |
|
| 53 |
-
EXPOSE
|
| 54 |
|
| 55 |
-
HEALTHCHECK CMD curl --fail http://localhost:
|
| 56 |
|
| 57 |
-
ENTRYPOINT ["
|
| 58 |
-
|
| 59 |
-
"--server.address=0.0.0.0", \
|
| 60 |
-
"--server.maxUploadSize=500", \
|
| 61 |
-
"--server.maxMessageSize=500"]
|
|
|
|
| 13 |
libopenblas-dev \
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
|
|
|
|
|
|
| 16 |
ENV HOME=/tmp
|
|
|
|
|
|
|
|
|
|
| 17 |
ENV HF_HOME=/tmp/huggingface
|
| 18 |
+
ENV PYTHONUNBUFFERED=1
|
| 19 |
+
ENV PYTHONPATH="/app/src"
|
| 20 |
+
ENV PORT=7860
|
| 21 |
|
| 22 |
+
# Create writable directories used at runtime
|
| 23 |
+
RUN mkdir -p /tmp/huggingface /app/static /app/static/audio /app/tmp && \
|
| 24 |
+
chmod -R 777 /tmp /app/static /app/tmp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Copy files
|
| 27 |
COPY requirements.txt ./
|
| 28 |
COPY src/ ./src/
|
| 29 |
COPY static/ ./static/
|
| 30 |
+
COPY frontend/ ./frontend/
|
| 31 |
+
COPY models/ ./models/
|
| 32 |
|
| 33 |
# Install Python dependencies
|
| 34 |
RUN pip3 install --no-cache-dir -r requirements.txt
|
| 35 |
|
| 36 |
+
EXPOSE 7860
|
| 37 |
|
| 38 |
+
HEALTHCHECK CMD curl --fail http://localhost:7860/health || exit 1
|
| 39 |
|
| 40 |
+
ENTRYPOINT ["python", "-m", "uvicorn", "src.server.main:app"]
|
| 41 |
+
CMD ["--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
|
|
frontend/app.js
ADDED
|
@@ -0,0 +1,768 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Mutable application state shared by the handlers in this file.
const state = {
  // Model catalog: ASR backends (moonshine / sensevoice) and summarization LLMs.
  config: { moonshine: {}, sensevoice: {}, llms: {} },
  // Currently selected ASR backend.
  backend: 'sensevoice',
  // Transcript segments currently shown in the transcript list.
  utterances: [],
  diarizedUtterances: null,
  diarizationStats: null,
  summary: '',
  // Remote audio source (set when no local file is in use).
  audioUrl: null,
  sourcePath: null,
  // Locally selected File object (set when no remote source is in use).
  uploadedFile: null,
  // Re-entrancy guards for the two long-running requests.
  transcribing: false,
  summarizing: false,
};

// Cached DOM references, resolved once at script load from a key -> element-id table.
const elements = Object.fromEntries(
  Object.entries({
    backendSelect: 'backend-select',
    modelSelect: 'model-select',
    llmSelect: 'llm-select',
    promptInput: 'prompt-input',
    vadSlider: 'vad-threshold',
    vadValue: 'vad-value',
    diarizationToggle: 'diarization-toggle',
    diarizationSettings: 'diarization-settings',
    numSpeakers: 'num-speakers',
    clusterSlider: 'cluster-threshold',
    clusterValue: 'cluster-value',
    sensevoiceOptions: 'sensevoice-options',
    sensevoiceLanguage: 'sensevoice-language',
    transcribeBtn: 'transcribe-btn',
    summaryBtn: 'summary-btn',
    statusText: 'status-text',
    audioPlayer: 'audio-player',
    transcriptList: 'transcript-list',
    transcriptTemplate: 'utterance-template',
    utteranceCount: 'utterance-count',
    summaryOutput: 'summary-output',
    diarizationPanel: 'diarization-summary',
    diarizationMetrics: 'diarization-metrics',
    speakerBreakdown: 'speaker-breakdown',
    transcriptFormat: 'transcript-format',
    summaryFormat: 'summary-format',
    exportTranscriptBtn: 'export-transcript',
    exportSummaryBtn: 'export-summary',
    includeTimestamps: 'include-timestamps',
    fileInput: 'file-input',
    youtubeUrl: 'youtube-url',
    youtubeFetch: 'youtube-fetch',
    podcastQuery: 'podcast-query',
    podcastSearch: 'podcast-search',
    podcastResults: 'podcast-results',
    episodeResults: 'episode-results',
  }).map(([key, id]) => [key, document.getElementById(id)]),
);

// Export format labels; these exact strings are sent to the export endpoints.
const TRANSCRIPT_FORMATS = [
  'SRT (SubRip)',
  'VTT (WebVTT)',
  'ASS (Advanced SubStation Alpha)',
  'Plain Text',
  'JSON',
  'ELAN (EAF)',
];

const SUMMARY_FORMATS = ['Markdown', 'Plain Text'];

// Id of the tab panel currently shown.
let activeTab = 'podcast-tab';
// Index of the highlighted utterance, or -1 when none is highlighted.
let activeUtteranceIndex = -1;
|
| 67 |
+
|
| 68 |
+
/**
 * Show a status message in the status element.
 *
 * @param {string} message Text to display.
 * @param {string} [tone='info'] Stored in `data-tone` on the status element.
 */
function setStatus(message, tone = 'info') {
  const { statusText } = elements;
  statusText.textContent = message;
  statusText.dataset.tone = tone;
}
|
| 72 |
+
|
| 73 |
+
/**
 * Format a duration in seconds as `m:ss`.
 *
 * Missing, non-numeric or negative input is rendered as `0:00` instead of
 * leaking "NaN:NaN" or negative components into the transcript UI.
 *
 * @param {number} seconds Offset in seconds (fractions are truncated).
 * @returns {string} Minutes and zero-padded seconds, e.g. `1:05`.
 */
function formatTime(seconds) {
  const numeric = Number(seconds);
  // Clamp anything unusable (NaN, Infinity, negatives) to zero.
  const total = Number.isFinite(numeric) && numeric > 0 ? Math.floor(numeric) : 0;
  const mins = Math.floor(total / 60);
  const secs = String(total % 60).padStart(2, '0');
  return `${mins}:${secs}`;
}
|
| 78 |
+
|
| 79 |
+
/**
 * Replace a list container's contents with a single "empty state" placeholder.
 *
 * The message is inserted as text, not markup, so it can never be interpreted
 * as HTML even if a caller passes user- or server-supplied text.
 *
 * @param {Element|null} container List container; the call is a no-op when falsy.
 * @param {string} message Placeholder text to show.
 */
function setListEmpty(container, message) {
  if (!container) return;
  const placeholder = document.createElement('div');
  placeholder.className = 'empty-state';
  placeholder.textContent = message;
  container.innerHTML = '';
  container.appendChild(placeholder);
}
|
| 83 |
+
|
| 84 |
+
/**
 * Load the model catalog from `/api/config/models` into `state.config` and
 * rebuild the model, LLM and export-format dropdowns.
 *
 * Failures are logged to the console and surfaced through the status line.
 */
async function fetchConfig() {
  try {
    const response = await fetch('/api/config/models');
    if (!response.ok) {
      throw new Error('Failed to fetch model catalog');
    }
    state.config = await response.json();
    [populateModelSelect, populateLLMSelect, populateExportSelects].forEach((populate) => populate());
  } catch (error) {
    console.error(error);
    setStatus(error.message, 'error');
  }
}
|
| 97 |
+
|
| 98 |
+
/**
 * Rebuild the model dropdown for the currently selected backend and show the
 * SenseVoice-specific options only when that backend is active.
 */
function populateModelSelect() {
  const { modelSelect, sensevoiceOptions } = elements;
  const backend = state.backend;

  modelSelect.innerHTML = '';
  const catalog = backend === 'moonshine' ? state.config.moonshine : state.config.sensevoice;

  // Catalog entries map a display label to the model identifier sent to the API.
  for (const [label, value] of Object.entries(catalog)) {
    const option = document.createElement('option');
    option.value = value;
    option.textContent = label;
    modelSelect.appendChild(option);
  }

  if (modelSelect.options.length > 0) {
    modelSelect.selectedIndex = 0;
  }
  sensevoiceOptions.classList.toggle('hidden', backend !== 'sensevoice');
}
|
| 113 |
+
|
| 114 |
+
/**
 * Rebuild the LLM dropdown from the keys of `state.config.llms`; each key is
 * used as both the option value and its label.
 */
function populateLLMSelect() {
  const { llmSelect } = elements;
  llmSelect.innerHTML = '';
  for (const name of Object.keys(state.config.llms)) {
    const option = document.createElement('option');
    option.value = name;
    option.textContent = name;
    llmSelect.appendChild(option);
  }
}
|
| 123 |
+
|
| 124 |
+
/**
 * Fill the transcript and summary export-format dropdowns from the
 * TRANSCRIPT_FORMATS and SUMMARY_FORMATS constants.
 */
function populateExportSelects() {
  // Clear one <select> and add an option per format (value === label).
  const fill = (select, formats) => {
    select.innerHTML = '';
    for (const fmt of formats) {
      const option = document.createElement('option');
      option.value = fmt;
      option.textContent = fmt;
      select.appendChild(option);
    }
  };

  fill(elements.transcriptFormat, TRANSCRIPT_FORMATS);
  fill(elements.summaryFormat, SUMMARY_FORMATS);
}
|
| 141 |
+
|
| 142 |
+
/**
 * Wire up tab switching: clicking a `.tab` activates it and the `.tab-panel`
 * whose id is given by the tab's `data-target`, deactivating all others.
 */
function initTabs() {
  const activate = (tab) => {
    const target = tab.dataset.target;
    if (target === activeTab) return; // already showing this panel

    for (const btn of document.querySelectorAll('.tab')) {
      btn.classList.remove('active');
    }
    for (const panel of document.querySelectorAll('.tab-panel')) {
      panel.classList.remove('active');
    }

    tab.classList.add('active');
    document.getElementById(target).classList.add('active');
    activeTab = target;
  };

  for (const tab of document.querySelectorAll('.tab')) {
    tab.addEventListener('click', () => activate(tab));
  }
}
|
| 154 |
+
|
| 155 |
+
/**
 * Attach listeners for the sidebar controls: backend selection, the VAD and
 * clustering sliders (mirrored into their value labels with two decimals),
 * and the diarization toggle that shows or hides its settings panel.
 */
function initSidebarInteractions() {
  const {
    backendSelect,
    vadSlider,
    vadValue,
    diarizationToggle,
    diarizationSettings,
    clusterSlider,
    clusterValue,
  } = elements;

  // Keep a label in sync with its slider while the user drags it.
  const mirrorSlider = (slider, label) => {
    slider.addEventListener('input', () => {
      label.textContent = Number(slider.value).toFixed(2);
    });
  };

  backendSelect.addEventListener('change', () => {
    state.backend = backendSelect.value;
    populateModelSelect();
  });

  mirrorSlider(vadSlider, vadValue);

  diarizationToggle.addEventListener('change', () => {
    diarizationSettings.classList.toggle('hidden', !diarizationToggle.checked);
  });

  mirrorSlider(clusterSlider, clusterValue);
}
|
| 173 |
+
|
| 174 |
+
/**
 * Discard the previous transcription: clear utterances and diarization data,
 * drop the active highlight index, and empty the related UI elements.
 */
function resetTranscriptionState() {
  Object.assign(state, {
    utterances: [],
    diarizedUtterances: null,
    diarizationStats: null,
  });
  activeUtteranceIndex = -1;

  const { transcriptList, utteranceCount, diarizationPanel } = elements;
  transcriptList.innerHTML = '';
  utteranceCount.textContent = '';
  diarizationPanel.classList.add('hidden');
}
|
| 183 |
+
|
| 184 |
+
/**
 * Collect the current sidebar settings into the options object that is sent,
 * JSON-encoded, with a transcription request.
 *
 * @returns {object} Backend, model, VAD threshold, language, text
 *   normalisation mode and diarization settings.
 */
function prepareTranscriptionOptions() {
  const checkedTextnorm = document.querySelector('input[name="textnorm"]:checked');
  const usingSenseVoice = state.backend === 'sensevoice';

  const diarization = {
    enable: elements.diarizationToggle.checked,
    // An empty field falls back to -1.
    num_speakers: Number(elements.numSpeakers.value || -1),
    cluster_threshold: Number(elements.clusterSlider.value),
  };

  return {
    backend: state.backend,
    model_name: elements.modelSelect.value,
    vad_threshold: Number(elements.vadSlider.value),
    // The language selector is only consulted for the SenseVoice backend.
    language: usingSenseVoice ? elements.sensevoiceLanguage.value : 'auto',
    textnorm: checkedTextnorm?.value || 'withitn',
    diarization,
  };
}
|
| 199 |
+
|
| 200 |
+
/**
 * Start a transcription for the current audio source and consume the
 * newline-delimited JSON event stream returned by `/api/transcribe`.
 *
 * The request carries either the uploaded file (`audio`) or the remote source
 * URL (`source`), plus the JSON-encoded options. Each streamed line is parsed
 * and forwarded to `handleTranscriptionEvent`.
 *
 * The final "Transcription complete" status is shown only when the stream
 * did not contain an `error` event, so a server-reported failure is not
 * overwritten by a success message.
 */
async function handleTranscription() {
  if (state.transcribing) return;
  if (!state.uploadedFile && !state.audioUrl) {
    setStatus('Upload or select an audio source first', 'warning');
    return;
  }

  resetTranscriptionState();
  state.transcribing = true;
  setStatus('Starting transcription...', 'info');

  const formData = new FormData();
  if (state.uploadedFile) {
    formData.append('audio', state.uploadedFile, state.uploadedFile.name);
  } else if (state.audioUrl) {
    formData.append('source', state.audioUrl);
  }
  formData.append('options', JSON.stringify(prepareTranscriptionOptions()));

  // Set once the server streams an `error` event.
  let streamReportedError = false;

  // Parse one NDJSON line and dispatch it; blank lines are ignored.
  const dispatchLine = (line) => {
    if (!line.trim()) return;
    const event = JSON.parse(line);
    if (event.type === 'error') streamReportedError = true;
    handleTranscriptionEvent(event);
  };

  try {
    const response = await fetch('/api/transcribe', {
      method: 'POST',
      body: formData,
    });
    if (!response.ok || !response.body) {
      throw new Error('Transcription request failed');
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    setStatus('Processing audio...', 'info');

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      // The last piece may be an incomplete line; keep it for the next chunk.
      buffer = lines.pop();
      lines.forEach(dispatchLine);
    }

    // Flush bytes still held by the streaming decoder, then handle a final
    // line that was not newline-terminated.
    buffer += decoder.decode();
    dispatchLine(buffer);

    if (!streamReportedError) {
      setStatus('Transcription complete', 'success');
    }
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
  } finally {
    state.transcribing = false;
  }
}
|
| 258 |
+
|
| 259 |
+
/**
 * Apply one event from the transcription stream.
 *
 * - `ready`: point the audio player at the provided audio URL.
 * - `utterance`: append the segment and re-render the transcript.
 * - `complete`: store diarization results, merge them over the final
 *   utterance list by index, then render transcript and speaker stats.
 * - `error`: show the message on the status line.
 *
 * Events of any other type are ignored.
 *
 * @param {object} event Parsed JSON event with a `type` field.
 */
function handleTranscriptionEvent(event) {
  const { type } = event;

  if (type === 'ready') {
    if (event.audioUrl) {
      state.audioUrl = event.audioUrl;
      elements.audioPlayer.src = event.audioUrl;
      elements.audioPlayer.currentTime = 0;
    }
    return;
  }

  if (type === 'utterance') {
    if (event.utterance) {
      state.utterances.push(event.utterance);
      renderTranscript();
    }
    return;
  }

  if (type === 'complete') {
    const { diarization, utterances: finalUtterances } = event;
    if (diarization) {
      state.diarizedUtterances = diarization.utterances || [];
      state.diarizationStats = diarization.stats || null;
    }

    // Non-empty diarized list, or null when there is nothing to merge.
    const diarized = state.diarizedUtterances?.length ? state.diarizedUtterances : null;
    if (finalUtterances) {
      state.utterances = diarized
        // Diarized fields win over the plain utterance at the same index.
        ? diarized.map((utt, index) => ({ ...(finalUtterances[index] || {}), ...utt }))
        : finalUtterances;
    } else if (diarized) {
      state.utterances = diarized;
    }

    renderTranscript();
    renderDiarizationStats();
    return;
  }

  if (type === 'error') {
    setStatus(event.message || 'Transcription error', 'error');
  }
}
|
| 298 |
+
|
| 299 |
+
/**
 * Rebuild the transcript list from `state.utterances`.
 *
 * Each item is cloned from the utterance template and tagged with its index
 * and start/end times (`data-*` attributes) for click-to-seek and highlight
 * lookup. A speaker tag is shown when the utterance has a numeric `speaker`.
 *
 * Because the list is rebuilt from scratch, the `active` class is re-applied
 * to the item at `activeUtteranceIndex`; otherwise the highlight would
 * disappear on every streamed utterance and not return until playback moved
 * to a different segment.
 */
function renderTranscript() {
  elements.transcriptList.innerHTML = '';
  const fragment = document.createDocumentFragment();

  state.utterances.forEach((utt, index) => {
    const node = elements.transcriptTemplate.content.cloneNode(true);
    const item = node.querySelector('.utterance-item');
    item.dataset.index = index.toString();
    item.dataset.start = utt.start;
    item.dataset.end = utt.end;
    if (index === activeUtteranceIndex) {
      item.classList.add('active');
    }

    node.querySelector('.timestamp').textContent = `[${formatTime(utt.start)}]`;
    node.querySelector('.utterance-text').textContent = utt.text;

    const speakerTag = node.querySelector('.speaker-tag');
    if (typeof utt.speaker === 'number') {
      // Speaker ids are zero-based; display them one-based.
      speakerTag.textContent = `Speaker ${utt.speaker + 1}`;
      speakerTag.classList.remove('hidden');
    }

    fragment.appendChild(node);
  });

  elements.transcriptList.appendChild(fragment);
  elements.utteranceCount.textContent = `${state.utterances.length} segments`;
}
|
| 323 |
+
|
| 324 |
+
/**
 * Render the diarization summary panel from `state.diarizationStats`.
 *
 * Hides the panel when there are no stats. Otherwise shows a totals card
 * (speaker count, duration) and one card per entry in `stats.speakers`.
 *
 * Every interpolated value is coerced to a finite number first. A missing or
 * malformed field therefore renders as 0 instead of throwing on `.toFixed`,
 * and no raw server-provided string is ever inserted as HTML.
 */
function renderDiarizationStats() {
  if (!state.diarizationStats) {
    elements.diarizationPanel.classList.add('hidden');
    return;
  }
  elements.diarizationPanel.classList.remove('hidden');
  const stats = state.diarizationStats;

  // Coerce to a finite number, defaulting to 0.
  const toNumber = (value) => {
    const n = Number(value);
    return Number.isFinite(n) ? n : 0;
  };
  // One-decimal rendering used for durations and percentages.
  const fixed1 = (value) => toNumber(value).toFixed(1);

  elements.diarizationMetrics.innerHTML = '';
  const totalCard = document.createElement('div');
  totalCard.className = 'metric-card';
  totalCard.innerHTML = `<strong>Total speakers:</strong> ${toNumber(stats.total_speakers)}<br/><strong>Duration:</strong> ${fixed1(stats.total_duration)}s`;
  elements.diarizationMetrics.appendChild(totalCard);

  elements.speakerBreakdown.innerHTML = '';
  const speakersFragment = document.createDocumentFragment();
  Object.entries(stats.speakers || {}).forEach(([speakerId, rawInfo]) => {
    const info = rawInfo || {};
    const card = document.createElement('div');
    card.className = 'metric-card';
    // Speaker ids are zero-based keys; display them one-based.
    card.innerHTML = `
<strong>Speaker ${Number(speakerId) + 1}</strong><br/>
Speaking time: ${fixed1(info.speaking_time)}s<br/>
Percentage: ${fixed1(info.percentage)}%<br/>
Utterances: ${toNumber(info.utterances)}<br/>
Avg length: ${fixed1(info.avg_utterance_length)}s
`;
    speakersFragment.appendChild(card);
  });
  elements.speakerBreakdown.appendChild(speakersFragment);
}
|
| 357 |
+
|
| 358 |
+
/**
 * Binary-search `state.utterances` for the segment to highlight at a
 * playback position.
 *
 * @param {number} currentTime Playback position in seconds.
 * @returns {number} Index of the utterance whose [start, end) range contains
 *   `currentTime`; otherwise the index of the last probed utterance that
 *   starts at or before it, or -1 when there is none.
 */
function findActiveUtterance(currentTime) {
  const { utterances } = state;
  let lo = 0;
  let hi = utterances.length - 1;
  let candidate = -1;

  while (lo <= hi) {
    const mid = Math.floor((lo + hi) / 2);
    const { start, end } = utterances[mid];

    if (currentTime < start) {
      hi = mid - 1;
    } else if (currentTime >= start && currentTime < end) {
      return mid;
    } else {
      // Past this segment: remember it and keep looking to the right.
      candidate = mid;
      lo = mid + 1;
    }
  }

  return candidate;
}
|
| 377 |
+
|
| 378 |
+
/**
 * Move the `active` highlight to the utterance with the given index and
 * scroll it into view. Does nothing when that index is already active.
 *
 * @param {number} index Index of the utterance to highlight.
 */
function updateActiveUtterance(index) {
  if (index === activeUtteranceIndex) return;

  const list = elements.transcriptList;
  list.querySelector('.utterance-item.active')?.classList.remove('active');

  const next = list.querySelector(`.utterance-item[data-index="${index}"]`);
  if (next) {
    next.classList.add('active');
    next.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }

  // Recorded even when no matching element exists.
  activeUtteranceIndex = index;
}
|
| 389 |
+
|
| 390 |
+
/**
 * Connect the audio player and the transcript list:
 * - during playback, highlight the utterance at the current position;
 * - clicks inside the list either drive the inline editor (edit / save /
 *   cancel buttons) or, anywhere else on an item, seek to its start time.
 */
function initAudioInteractions() {
  const onTimeUpdate = () => {
    if (!state.utterances.length) return;
    const idx = findActiveUtterance(elements.audioPlayer.currentTime);
    if (idx >= 0) updateActiveUtterance(idx);
  };

  const onTranscriptClick = (event) => {
    const item = event.target.closest('.utterance-item');
    if (!item) return;

    const clicked = (selector) => event.target.closest(selector);
    const index = Number(item.dataset.index);

    if (clicked('.edit-btn')) {
      toggleEdit(item, true);
      return;
    }

    if (clicked('.save-edit')) {
      const newText = item.querySelector('textarea').value.trim();
      // An empty edit is rejected and the editor stays open.
      if (newText.length === 0) return;
      state.utterances[index].text = newText;
      item.querySelector('.utterance-text').textContent = newText;
      toggleEdit(item, false);
      return;
    }

    if (clicked('.cancel-edit')) {
      toggleEdit(item, false);
      return;
    }

    seekToTime(Number(item.dataset.start));
  };

  elements.audioPlayer.addEventListener('timeupdate', onTimeUpdate);
  elements.transcriptList.addEventListener('click', onTranscriptClick);
}
|
| 430 |
+
|
| 431 |
+
/**
 * Switch a transcript item between read-only text and its inline editor.
 *
 * @param {Element} item The `.utterance-item` element.
 * @param {boolean} editing `true` to open the editor (pre-filled with the
 *   current text), `false` to close it.
 */
function toggleEdit(item, editing) {
  const textBlock = item.querySelector('.utterance-text');
  const editArea = item.querySelector('.edit-area');
  if (!(textBlock && editArea)) return;

  if (editing) {
    editArea.querySelector('textarea').value = textBlock.textContent;
  }
  textBlock.classList.toggle('hidden', editing);
  editArea.classList.toggle('hidden', !editing);
}
|
| 446 |
+
|
| 447 |
+
/**
 * Jump playback to a position, highlight the matching utterance and start
 * playing. If the player has no metadata yet, the seek is deferred until
 * `loadedmetadata` fires.
 *
 * @param {number} timeInSeconds Target position; non-finite values are ignored.
 */
function seekToTime(timeInSeconds) {
  if (!Number.isFinite(timeInSeconds)) return;
  const audio = elements.audioPlayer;

  const executeSeek = () => {
    audio.currentTime = Math.max(0, timeInSeconds);
    updateActiveUtterance(findActiveUtterance(audio.currentTime));
    // play() may reject; that is deliberately ignored.
    audio.play().catch(() => {});
  };

  if (audio.readyState >= 1) {
    executeSeek();
    return;
  }

  // One-shot listener: removed automatically after the first invocation.
  audio.addEventListener('loadedmetadata', executeSeek, { once: true });
  audio.load();
}
|
| 468 |
+
|
| 469 |
+
/**
 * Request a summary of the current transcript from `/api/summarize` and
 * stream the result into the summary output element.
 *
 * The response is newline-delimited JSON; every `partial` event carries the
 * summary text so far, which replaces the displayed text. A final line
 * that is not newline-terminated is also processed, so the last update is
 * not lost when the server omits the trailing newline.
 */
async function handleSummaryGeneration() {
  if (state.summarizing || !state.utterances.length) return;
  state.summarizing = true;
  setStatus('Generating summary...', 'info');
  elements.summaryOutput.textContent = '';

  const payload = {
    transcript: state.utterances.map((u) => u.text).join('\n'),
    llm_model: elements.llmSelect.value,
    prompt: elements.promptInput.value || 'Summarize the transcript below.',
  };

  // Parse one NDJSON line and apply it; blank lines are ignored.
  const applyLine = (line) => {
    if (!line.trim()) return;
    const event = JSON.parse(line);
    if (event.type === 'partial' && event.content) {
      elements.summaryOutput.textContent = event.content;
    }
  };

  try {
    const response = await fetch('/api/summarize', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!response.ok || !response.body) throw new Error('Failed to generate summary');

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      // The last piece may be an incomplete line; keep it for the next chunk.
      buffer = lines.pop();
      lines.forEach(applyLine);
    }

    // Flush bytes still held by the streaming decoder, then handle the
    // remaining unterminated line.
    buffer += decoder.decode();
    applyLine(buffer);

    setStatus('Summary ready', 'success');
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
  } finally {
    state.summarizing = false;
  }
}
|
| 517 |
+
|
| 518 |
+
/**
 * Export the current transcript in the selected format via
 * `/api/export/transcript`. Does nothing when there are no utterances.
 */
async function handleExportTranscript() {
  const { utterances } = state;
  if (!utterances.length) return;

  await downloadFile(
    '/api/export/transcript',
    {
      format: elements.transcriptFormat.value,
      include_timestamps: elements.includeTimestamps.checked,
      utterances,
    },
    'transcript',
  );
}
|
| 527 |
+
|
| 528 |
+
/**
 * Export the displayed summary in the selected format via
 * `/api/export/summary`. Does nothing when the summary is blank.
 */
async function handleExportSummary() {
  const summary = elements.summaryOutput.textContent;
  if (!summary.trim()) return;

  await downloadFile(
    '/api/export/summary',
    {
      format: elements.summaryFormat.value,
      summary,
      metadata: {},
    },
    'summary',
  );
}
|
| 537 |
+
|
| 538 |
+
/**
 * POST a JSON payload to an export endpoint and save the response as a file.
 *
 * The file name comes from the `Content-Disposition` response header and
 * falls back to `<prefix>.txt`.
 *
 * The temporary link is attached to the document while it is clicked, and
 * the blob URL is revoked on a later task rather than immediately. Revoking
 * synchronously after `click()` can cancel the download in some browsers.
 *
 * @param {string} url Export endpoint.
 * @param {object} payload JSON-serialisable request body.
 * @param {string} prefix Fallback file-name stem.
 */
async function downloadFile(url, payload, prefix) {
  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    if (!response.ok) throw new Error('Export failed');

    const blob = await response.blob();
    const filename = getFilenameFromDisposition(response.headers.get('Content-Disposition')) || `${prefix}.txt`;

    const objectUrl = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = filename;
    link.style.display = 'none';
    document.body.appendChild(link);
    link.click();
    link.remove();
    // Give the browser time to start the download before releasing the blob.
    setTimeout(() => URL.revokeObjectURL(objectUrl), 0);

    setStatus('Export complete', 'success');
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
  }
}
|
| 559 |
+
|
| 560 |
+
/**
 * Extract the file name from a `Content-Disposition` header value.
 *
 * Handles, in order of preference:
 *   1. the extended form `filename*=charset'lang'percent-encoded`,
 *   2. a quoted `filename="..."`,
 *   3. an unquoted `filename=token`, stopping at the next `;` so trailing
 *      parameters are not swallowed into the name.
 *
 * @param {string|null|undefined} disposition Raw header value.
 * @returns {string|null} The file name, or null when none is present.
 */
function getFilenameFromDisposition(disposition) {
  if (!disposition) return null;

  const extended = disposition.match(/filename\*\s*=\s*[^';]*'[^']*'([^;]+)/i);
  if (extended) {
    try {
      return decodeURIComponent(extended[1].trim());
    } catch {
      // Malformed percent-encoding: fall back to the plain parameter below.
    }
  }

  const quoted = disposition.match(/filename\s*=\s*"([^"]*)"/i);
  if (quoted) return quoted[1] || null;

  const bare = disposition.match(/filename\s*=\s*([^";]+)/i);
  if (bare) return bare[1].trim() || null;

  return null;
}
|
| 565 |
+
|
| 566 |
+
/**
 * Handle a change on the file input: remember the chosen file as the audio
 * source, clear any remote source, and load the file into the player through
 * a blob URL.
 *
 * A blob URL left in the player from an earlier upload is revoked once the
 * new source is set, so repeated uploads do not keep every previous file
 * alive in memory.
 *
 * @param {Event} event `change` event from the file input.
 */
function handleFileUpload(event) {
  const file = event.target.files?.[0];
  if (!file) return;

  state.uploadedFile = file;
  state.audioUrl = null;

  const previousSrc = elements.audioPlayer.src;
  elements.audioPlayer.src = URL.createObjectURL(file);
  if (typeof previousSrc === 'string' && previousSrc.startsWith('blob:')) {
    URL.revokeObjectURL(previousSrc);
  }

  setStatus(`Loaded ${file.name}`, 'info');
}
|
| 575 |
+
|
| 576 |
+
/**
 * Ask the backend to fetch the audio for the YouTube URL in the input field.
 *
 * Returns early when the field is blank. On success the returned audio URL
 * becomes the active source (any uploaded file is dropped from state) and is
 * loaded into the audio player. Failures are logged and reported through
 * setStatus.
 *
 * @returns {Promise<void>}
 */
async function handleYoutubeFetch() {
  const videoUrl = elements.youtubeUrl.value.trim();
  if (videoUrl === '') {
    return;
  }

  setStatus('Downloading audio from YouTube...', 'info');

  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url: videoUrl }),
  };

  try {
    const reply = await fetch('/api/youtube/fetch', request);
    if (!reply.ok) {
      throw new Error('YouTube download failed');
    }

    const result = await reply.json();
    state.audioUrl = result.audioUrl;
    state.uploadedFile = null;
    elements.audioPlayer.src = result.audioUrl;
    setStatus('YouTube audio ready', 'success');
  } catch (failure) {
    console.error(failure);
    setStatus(failure.message, 'error');
  }
}
|
| 596 |
+
|
| 597 |
+
/**
 * Search podcasts for the text in the query field and render the results.
 *
 * Returns early when the query is blank. Each result becomes a `.list-item`
 * row with an "Episodes" button whose `data-feed` attribute carries the feed
 * URL. The episode list is reset to its placeholder. Failures are logged,
 * reported through setStatus, and replaced by an error placeholder.
 *
 * @returns {Promise<void>}
 */
async function handlePodcastSearch() {
  const query = elements.podcastQuery.value.trim();
  if (!query) return;

  setStatus('Searching podcasts...', 'info');
  setListEmpty(elements.podcastResults, 'Searching podcasts...');
  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');

  try {
    const res = await fetch(`/api/podcast/search?query=${encodeURIComponent(query)}`);
    if (!res.ok) throw new Error('Podcast search failed');

    const series = await res.json();
    if (!series.length) {
      setListEmpty(elements.podcastResults, 'No podcasts match your search yet.');
      // Replace the "Searching..." status so it does not linger after the search ends.
      setStatus('No podcasts found', 'info');
      return;
    }

    elements.podcastResults.innerHTML = '';
    const fragment = document.createDocumentFragment();
    series.forEach((item) => {
      // Titles, artists and feed URLs come from an external catalogue, so they
      // are assigned as text / attribute values and never parsed as HTML.
      const row = document.createElement('div');
      row.className = 'list-item';

      const info = document.createElement('div');
      const titleEl = document.createElement('strong');
      titleEl.textContent = item.title;
      const artistEl = document.createElement('span');
      artistEl.textContent = item.artist || 'Unknown artist';
      info.append(titleEl, document.createElement('br'), artistEl);

      const button = document.createElement('button');
      button.dataset.feed = item.feed_url;
      button.textContent = 'Episodes';

      row.append(info, button);
      fragment.appendChild(row);
    });
    elements.podcastResults.appendChild(fragment);
    setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
    setStatus('Podcasts ready', 'success');
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
    setListEmpty(elements.podcastResults, 'Unable to load podcasts right now.');
  }
}
|
| 633 |
+
|
| 634 |
+
/**
 * Load the episodes of a podcast feed and render up to 15 of them.
 *
 * When `sourceItem` is given it becomes the only `.list-item` in the podcast
 * list carrying the `selected` class. Each rendered episode gets a "Download"
 * button whose `data-url` / `data-title` attributes carry the audio URL and
 * title. Failures are logged, reported through setStatus, and replaced by an
 * error placeholder.
 *
 * @param {string} feedUrl - Feed URL passed to the episodes endpoint.
 * @param {Element|null} [sourceItem=null] - Podcast row to mark as selected.
 * @returns {Promise<void>}
 */
async function loadEpisodes(feedUrl, sourceItem = null) {
  setStatus('Loading episodes...', 'info');
  if (sourceItem) {
    elements.podcastResults
      .querySelectorAll('.list-item')
      .forEach((item) => item.classList.remove('selected'));
    sourceItem.classList.add('selected');
  }
  setListEmpty(elements.episodeResults, 'Loading episodes...');

  try {
    const res = await fetch(`/api/podcast/episodes?feed_url=${encodeURIComponent(feedUrl)}`);
    if (!res.ok) throw new Error('Failed to load episodes');

    const episodes = await res.json();
    if (!episodes.length) {
      setListEmpty(elements.episodeResults, 'No episodes available for this podcast.');
      return;
    }

    elements.episodeResults.innerHTML = '';
    const fragment = document.createDocumentFragment();
    episodes.slice(0, 15).forEach((ep) => {
      // Episode metadata comes straight from a third-party feed, so it is set
      // as text / attribute values and never parsed as HTML.
      const row = document.createElement('div');
      row.className = 'list-item';

      const info = document.createElement('div');
      const titleEl = document.createElement('strong');
      titleEl.textContent = ep.title;
      const publishedEl = document.createElement('span');
      publishedEl.textContent = ep.published || '';
      info.append(titleEl, document.createElement('br'), publishedEl);

      const button = document.createElement('button');
      button.dataset.url = ep.audio_url;
      button.dataset.title = ep.title;
      button.textContent = 'Download';

      row.append(info, button);
      fragment.appendChild(row);
    });
    elements.episodeResults.appendChild(fragment);
    setStatus('Episodes ready', 'success');
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
    setListEmpty(elements.episodeResults, 'Unable to load episodes right now.');
  }
}
|
| 671 |
+
|
| 672 |
+
/**
 * Ask the backend to download a podcast episode and make it the active audio.
 *
 * On success the returned audio URL replaces any uploaded file in state and is
 * loaded into the audio player. When `triggerButton` is given it is disabled
 * and labelled "Downloading…" during the request, then shows "Ready ✓" or
 * "Retry" for two seconds before returning to its original label.
 *
 * @param {string} audioUrl - Remote episode URL sent to the backend.
 * @param {string} title - Episode title sent to the backend.
 * @param {HTMLButtonElement|null} [triggerButton=null] - Button that started the download.
 * @returns {Promise<void>}
 */
async function downloadEpisode(audioUrl, title, triggerButton = null) {
  setStatus('Downloading episode...', 'info');
  let originalLabel = null;
  if (triggerButton) {
    // The button is clickable again while the previous result is still shown.
    // Cancel that pending reset so it cannot overwrite the label mid-download.
    if (triggerButton.dataset.resetTimer) {
      clearTimeout(Number(triggerButton.dataset.resetTimer));
      delete triggerButton.dataset.resetTimer;
    }
    // Remember the real label once, so a re-click during the feedback window
    // does not capture "Ready ✓" / "Retry" as the label to restore.
    if (!triggerButton.dataset.originalLabel) {
      triggerButton.dataset.originalLabel = triggerButton.textContent;
    }
    originalLabel = triggerButton.dataset.originalLabel;
    triggerButton.disabled = true;
    triggerButton.classList.remove('success', 'error');
    triggerButton.classList.add('loading');
    triggerButton.textContent = 'Downloading…';
  }
  try {
    const res = await fetch('/api/podcast/download', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ audioUrl, title }),
    });
    if (!res.ok) throw new Error('Episode download failed');
    const data = await res.json();
    state.audioUrl = data.audioUrl;
    state.uploadedFile = null;
    elements.audioPlayer.src = data.audioUrl;
    setStatus('Episode ready', 'success');
    if (triggerButton) {
      triggerButton.textContent = 'Ready ✓';
      triggerButton.classList.add('success');
    }
  } catch (err) {
    console.error(err);
    setStatus(err.message, 'error');
    if (triggerButton) {
      triggerButton.textContent = 'Retry';
      triggerButton.classList.add('error');
    }
  } finally {
    if (triggerButton) {
      triggerButton.disabled = false;
      triggerButton.classList.remove('loading');
      const timerId = setTimeout(() => {
        triggerButton.classList.remove('success', 'error');
        triggerButton.textContent = originalLabel || 'Download';
        delete triggerButton.dataset.resetTimer;
      }, 2000);
      triggerButton.dataset.resetTimer = String(timerId);
    }
  }
}
|
| 715 |
+
|
| 716 |
+
/**
 * Attach delegated click handlers to the podcast and episode lists.
 *
 * The list rows are rebuilt on every search, so the listeners sit on the list
 * containers and react only to clicks that land on (or inside) a button
 * carrying the relevant data attribute.
 */
function initPodcastInteractions() {
  const onPodcastListClick = (event) => {
    const feedButton = event.target.closest('button[data-feed]');
    if (!feedButton) {
      return;
    }
    const row = feedButton.closest('.list-item');
    loadEpisodes(feedButton.dataset.feed, row);
  };

  const onEpisodeListClick = (event) => {
    const episodeButton = event.target.closest('button[data-url]');
    if (!episodeButton) {
      return;
    }
    const { url, title } = episodeButton.dataset;
    downloadEpisode(url, title, episodeButton);
  };

  elements.podcastResults.addEventListener('click', onPodcastListClick);
  elements.episodeResults.addEventListener('click', onEpisodeListClick);
}
|
| 731 |
+
|
| 732 |
+
/**
 * Wire the main controls to their handlers.
 *
 * Covers the transcribe / summary / export buttons, the file input, the
 * YouTube fetch button and the podcast search button. Pressing Enter in the
 * podcast query field also starts a search, with the key's default action
 * suppressed.
 */
function initEventBindings() {
  // [element, event type, handler], registered in this order.
  const bindings = [
    [elements.transcribeBtn, 'click', handleTranscription],
    [elements.summaryBtn, 'click', handleSummaryGeneration],
    [elements.exportTranscriptBtn, 'click', handleExportTranscript],
    [elements.exportSummaryBtn, 'click', handleExportSummary],
    [elements.fileInput, 'change', handleFileUpload],
    [elements.youtubeFetch, 'click', handleYoutubeFetch],
    [elements.podcastSearch, 'click', handlePodcastSearch],
  ];
  for (const [target, eventType, handler] of bindings) {
    target.addEventListener(eventType, handler);
  }

  elements.podcastQuery.addEventListener('keydown', (keyEvent) => {
    if (keyEvent.key !== 'Enter') {
      return;
    }
    keyEvent.preventDefault();
    handlePodcastSearch();
  });
}
|
| 747 |
+
|
| 748 |
+
/**
 * Bootstrap the page.
 *
 * Runs the UI initialisers in a fixed order, fills the backend selector with
 * its two options (SenseVoice preselected) and mirrors the selection into
 * state, shows the placeholder text in both podcast lists, waits for
 * fetchConfig, and finally sets the status to "Ready".
 *
 * @returns {Promise<void>}
 */
async function init() {
  const setupSteps = [
    initTabs,
    initSidebarInteractions,
    initAudioInteractions,
    initEventBindings,
    initPodcastInteractions,
  ];
  for (const runStep of setupSteps) {
    runStep();
  }

  const backendSelect = elements.backendSelect;
  backendSelect.innerHTML = `
<option value="moonshine">Moonshine</option>
<option value="sensevoice" selected>SenseVoice</option>
`;
  state.backend = backendSelect.value;

  setListEmpty(elements.podcastResults, 'Search to discover podcasts.');
  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');

  await fetchConfig();
  setStatus('Ready');
}

// Start the application as soon as the module is evaluated.
init();
|
frontend/index.html
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>VoxSum Studio</title>
|
| 7 |
+
<link rel="stylesheet" href="/styles.css" />
|
| 8 |
+
</head>
|
| 9 |
+
<body>
|
| 10 |
+
<header class="app-header">
|
| 11 |
+
<h1>VoxSum Studio</h1>
|
| 12 |
+
<p class="tagline">Transform Audio into Insightful Summaries</p>
|
| 13 |
+
</header>
|
| 14 |
+
<div class="app-shell">
|
| 15 |
+
<aside class="sidebar">
|
| 16 |
+
<section class="panel">
|
| 17 |
+
<h2>ASR Settings</h2>
|
| 18 |
+
<label for="backend-select">Backend</label>
|
| 19 |
+
<select id="backend-select"></select>
|
| 20 |
+
|
| 21 |
+
<label for="model-select">Model</label>
|
| 22 |
+
<select id="model-select"></select>
|
| 23 |
+
|
| 24 |
+
<div id="sensevoice-options" class="conditional hidden">
|
| 25 |
+
<label for="sensevoice-language">Language</label>
|
| 26 |
+
<select id="sensevoice-language">
|
| 27 |
+
<option value="auto">Auto</option>
|
| 28 |
+
<option value="zh">Chinese</option>
|
| 29 |
+
<option value="en">English</option>
|
| 30 |
+
<option value="ja">Japanese</option>
|
| 31 |
+
<option value="ko">Korean</option>
|
| 32 |
+
<option value="yue">Cantonese</option>
|
| 33 |
+
</select>
|
| 34 |
+
|
| 35 |
+
<label>Text Normalization</label>
|
| 36 |
+
<div class="radio-group">
|
| 37 |
+
<label><input type="radio" name="textnorm" value="withitn" checked /> With ITN</label>
|
| 38 |
+
<label><input type="radio" name="textnorm" value="noitn" /> Raw</label>
|
| 39 |
+
</div>
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
<label for="vad-threshold">VAD Threshold</label>
|
| 43 |
+
<input id="vad-threshold" type="range" min="0.1" max="0.9" step="0.05" value="0.5" />
|
| 44 |
+
<span id="vad-value" class="hint">0.50</span>
|
| 45 |
+
</section>
|
| 46 |
+
|
| 47 |
+
<section class="panel">
|
| 48 |
+
<h2>Diarization</h2>
|
| 49 |
+
<label class="toggle">
|
| 50 |
+
<input id="diarization-toggle" type="checkbox" /> Enable speaker diarization
|
| 51 |
+
</label>
|
| 52 |
+
<div id="diarization-settings" class="conditional hidden">
|
| 53 |
+
<label for="num-speakers">Number of speakers (-1 = auto)</label>
|
| 54 |
+
<input id="num-speakers" type="number" min="-1" max="10" value="-1" />
|
| 55 |
+
|
| 56 |
+
<label for="cluster-threshold">Cluster threshold</label>
|
| 57 |
+
<input id="cluster-threshold" type="range" min="0.1" max="1" step="0.05" value="0.5" />
|
| 58 |
+
<span id="cluster-value" class="hint">0.50</span>
|
| 59 |
+
</div>
|
| 60 |
+
</section>
|
| 61 |
+
|
| 62 |
+
<section class="panel">
|
| 63 |
+
<h2>Summarization</h2>
|
| 64 |
+
<label for="llm-select">LLM Model</label>
|
| 65 |
+
<select id="llm-select"></select>
|
| 66 |
+
|
| 67 |
+
<label for="prompt-input">Custom Prompt</label>
|
| 68 |
+
<textarea id="prompt-input" rows="4">Summarize the transcript below.</textarea>
|
| 69 |
+
</section>
|
| 70 |
+
</aside>
|
| 71 |
+
|
| 72 |
+
<main class="content">
|
| 73 |
+
<nav class="tabs">
|
| 74 |
+
<button class="tab active" data-target="podcast-tab">🎙️ Podcast</button>
|
| 75 |
+
<button class="tab" data-target="audio-tab">🎵 Audio Input</button>
|
| 76 |
+
<button class="tab" data-target="results-tab">📄 Results</button>
|
| 77 |
+
</nav>
|
| 78 |
+
|
| 79 |
+
<section id="podcast-tab" class="tab-panel active">
|
| 80 |
+
<div class="panel">
|
| 81 |
+
<h2>Search Podcasts</h2>
|
| 82 |
+
<div class="form-row">
|
| 83 |
+
<input id="podcast-query" type="text" placeholder="Podcast title" />
|
| 84 |
+
<button id="podcast-search">Search</button>
|
| 85 |
+
</div>
|
| 86 |
+
<div class="list-grid">
|
| 87 |
+
<section class="list-section">
|
| 88 |
+
<header class="list-section-header">
|
| 89 |
+
<h3>Podcast Channels</h3>
|
| 90 |
+
<p class="list-hint">Pick a show to reveal recent episodes.</p>
|
| 91 |
+
</header>
|
| 92 |
+
<div id="podcast-results" class="list"></div>
|
| 93 |
+
</section>
|
| 94 |
+
<section class="list-section">
|
| 95 |
+
<header class="list-section-header">
|
| 96 |
+
<h3>Episodes</h3>
|
| 97 |
+
<p class="list-hint">Episodes for the selected podcast appear here.</p>
|
| 98 |
+
</header>
|
| 99 |
+
<div id="episode-results" class="list"></div>
|
| 100 |
+
</section>
|
| 101 |
+
</div>
|
| 102 |
+
</div>
|
| 103 |
+
</section>
|
| 104 |
+
|
| 105 |
+
<section id="audio-tab" class="tab-panel">
|
| 106 |
+
<div class="panel">
|
| 107 |
+
<h2>YouTube</h2>
|
| 108 |
+
<div class="form-row">
|
| 109 |
+
<input id="youtube-url" type="url" placeholder="https://youtube.com/..." />
|
| 110 |
+
<button id="youtube-fetch">Fetch Audio</button>
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
<div class="panel">
|
| 114 |
+
<h2>Upload Audio</h2>
|
| 115 |
+
<input id="file-input" type="file" accept="audio/*" />
|
| 116 |
+
</div>
|
| 117 |
+
</section>
|
| 118 |
+
|
| 119 |
+
<section id="results-tab" class="tab-panel">
|
| 120 |
+
<div class="actions">
|
| 121 |
+
<button id="transcribe-btn" class="primary">Transcribe Audio</button>
|
| 122 |
+
<button id="summary-btn" class="secondary">Generate Summary</button>
|
| 123 |
+
<span id="status-text" class="status-text">Ready</span>
|
| 124 |
+
</div>
|
| 125 |
+
|
| 126 |
+
<section class="panel">
|
| 127 |
+
<h2>Audio Player</h2>
|
| 128 |
+
<audio id="audio-player" controls preload="auto"></audio>
|
| 129 |
+
</section>
|
| 130 |
+
|
| 131 |
+
<section class="panel">
|
| 132 |
+
<div class="panel-header">
|
| 133 |
+
<h2>Transcript</h2>
|
| 134 |
+
<span id="utterance-count" class="hint"></span>
|
| 135 |
+
</div>
|
| 136 |
+
<div id="transcript-container">
|
| 137 |
+
<ul id="transcript-list"></ul>
|
| 138 |
+
</div>
|
| 139 |
+
</section>
|
| 140 |
+
|
| 141 |
+
<section id="diarization-summary" class="panel hidden">
|
| 142 |
+
<h2>Speaker Analysis</h2>
|
| 143 |
+
<div id="diarization-metrics"></div>
|
| 144 |
+
<div id="speaker-breakdown"></div>
|
| 145 |
+
</section>
|
| 146 |
+
|
| 147 |
+
<section class="panel">
|
| 148 |
+
<h2>Summary</h2>
|
| 149 |
+
<div id="summary-output" class="summary"></div>
|
| 150 |
+
</section>
|
| 151 |
+
|
| 152 |
+
<section class="panel">
|
| 153 |
+
<h2>Export</h2>
|
| 154 |
+
<div class="export-grid">
|
| 155 |
+
<div>
|
| 156 |
+
<label for="transcript-format">Transcript format</label>
|
| 157 |
+
<select id="transcript-format"></select>
|
| 158 |
+
</div>
|
| 159 |
+
<div>
|
| 160 |
+
<label class="toggle">
|
| 161 |
+
<input id="include-timestamps" type="checkbox" checked /> Include timestamps
|
| 162 |
+
</label>
|
| 163 |
+
</div>
|
| 164 |
+
<button id="export-transcript">Export Transcript</button>
|
| 165 |
+
<div>
|
| 166 |
+
<label for="summary-format">Summary format</label>
|
| 167 |
+
<select id="summary-format"></select>
|
| 168 |
+
</div>
|
| 169 |
+
<button id="export-summary">Export Summary</button>
|
| 170 |
+
</div>
|
| 171 |
+
</section>
|
| 172 |
+
</section>
|
| 173 |
+
</main>
|
| 174 |
+
</div>
|
| 175 |
+
|
| 176 |
+
<template id="utterance-template">
|
| 177 |
+
<li class="utterance-item">
|
| 178 |
+
<div class="utterance-header">
|
| 179 |
+
<span class="timestamp"></span>
|
| 180 |
+
<span class="speaker-tag hidden"></span>
|
| 181 |
+
<div class="utterance-actions">
|
| 182 |
+
<button class="edit-btn" title="Edit">✏️</button>
|
| 183 |
+
</div>
|
| 184 |
+
</div>
|
| 185 |
+
<div class="utterance-text"></div>
|
| 186 |
+
<div class="edit-area hidden">
|
| 187 |
+
<textarea rows="3"></textarea>
|
| 188 |
+
<div class="edit-controls">
|
| 189 |
+
<button class="save-edit">Save</button>
|
| 190 |
+
<button class="cancel-edit">Cancel</button>
|
| 191 |
+
</div>
|
| 192 |
+
</div>
|
| 193 |
+
</li>
|
| 194 |
+
</template>
|
| 195 |
+
|
| 196 |
+
<script src="/app.js" type="module"></script>
|
| 197 |
+
</body>
|
| 198 |
+
</html>
|
frontend/styles.css
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
* {
|
| 2 |
+
box-sizing: border-box;
|
| 3 |
+
}
|
| 4 |
+
|
| 5 |
+
body {
|
| 6 |
+
margin: 0;
|
| 7 |
+
font-family: 'Inter', 'Segoe UI', sans-serif;
|
| 8 |
+
background: linear-gradient(180deg, #0f172a 0%, #111827 100%);
|
| 9 |
+
color: #e5e7eb;
|
| 10 |
+
min-height: 100vh;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
.app-header {
|
| 14 |
+
padding: 2rem 3rem 1.5rem;
|
| 15 |
+
background: rgba(15, 23, 42, 0.8);
|
| 16 |
+
backdrop-filter: blur(10px);
|
| 17 |
+
border-bottom: 1px solid rgba(148, 163, 184, 0.2);
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.app-header h1 {
|
| 21 |
+
margin: 0;
|
| 22 |
+
font-size: 2.5rem;
|
| 23 |
+
letter-spacing: 0.05em;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.app-header .tagline {
|
| 27 |
+
margin: 0.5rem 0 0;
|
| 28 |
+
color: #94a3b8;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
.app-shell {
|
| 32 |
+
display: grid;
|
| 33 |
+
grid-template-columns: 320px 1fr;
|
| 34 |
+
gap: 1.5rem;
|
| 35 |
+
padding: 1.5rem 2rem 3rem;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
.sidebar {
|
| 39 |
+
display: flex;
|
| 40 |
+
flex-direction: column;
|
| 41 |
+
gap: 1.5rem;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.panel {
|
| 45 |
+
background: rgba(30, 41, 59, 0.7);
|
| 46 |
+
border: 1px solid rgba(148, 163, 184, 0.15);
|
| 47 |
+
border-radius: 16px;
|
| 48 |
+
padding: 1.25rem;
|
| 49 |
+
box-shadow: 0 20px 45px rgba(15, 23, 42, 0.35);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
.panel h2 {
|
| 53 |
+
margin: 0 0 1rem;
|
| 54 |
+
font-size: 1.1rem;
|
| 55 |
+
letter-spacing: 0.02em;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
.panel-header {
|
| 59 |
+
display: flex;
|
| 60 |
+
align-items: center;
|
| 61 |
+
justify-content: space-between;
|
| 62 |
+
margin-bottom: 0.5rem;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
label {
|
| 66 |
+
display: block;
|
| 67 |
+
font-size: 0.9rem;
|
| 68 |
+
margin-bottom: 0.35rem;
|
| 69 |
+
color: #cbd5f5;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
input[type="text"],
|
| 73 |
+
input[type="url"],
|
| 74 |
+
input[type="number"],
|
| 75 |
+
select,
|
| 76 |
+
textarea {
|
| 77 |
+
width: 100%;
|
| 78 |
+
padding: 0.6rem 0.75rem;
|
| 79 |
+
border-radius: 10px;
|
| 80 |
+
border: 1px solid rgba(148, 163, 184, 0.2);
|
| 81 |
+
background: rgba(15, 23, 42, 0.6);
|
| 82 |
+
color: #e5e7eb;
|
| 83 |
+
font: inherit;
|
| 84 |
+
transition: border-color 0.2s ease, box-shadow 0.2s ease;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
input:focus,
|
| 88 |
+
select:focus,
|
| 89 |
+
textarea:focus {
|
| 90 |
+
outline: none;
|
| 91 |
+
border-color: #38bdf8;
|
| 92 |
+
box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
textarea {
|
| 96 |
+
resize: vertical;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
input[type="range"] {
|
| 100 |
+
width: 100%;
|
| 101 |
+
margin: 0.5rem 0;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.hint {
|
| 105 |
+
font-size: 0.8rem;
|
| 106 |
+
color: #94a3b8;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.toggle {
|
| 110 |
+
display: flex;
|
| 111 |
+
align-items: center;
|
| 112 |
+
gap: 0.6rem;
|
| 113 |
+
font-size: 0.9rem;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.radio-group {
|
| 117 |
+
display: flex;
|
| 118 |
+
gap: 0.75rem;
|
| 119 |
+
margin-bottom: 0.5rem;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.radio-group input {
|
| 123 |
+
margin-right: 0.35rem;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.content {
|
| 127 |
+
display: flex;
|
| 128 |
+
flex-direction: column;
|
| 129 |
+
gap: 1.5rem;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.tabs {
|
| 133 |
+
display: inline-flex;
|
| 134 |
+
background: rgba(30, 41, 59, 0.6);
|
| 135 |
+
border-radius: 999px;
|
| 136 |
+
padding: 0.4rem;
|
| 137 |
+
width: fit-content;
|
| 138 |
+
border: 1px solid rgba(148, 163, 184, 0.2);
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.tab {
|
| 142 |
+
border: none;
|
| 143 |
+
background: transparent;
|
| 144 |
+
color: #94a3b8;
|
| 145 |
+
padding: 0.6rem 1.2rem;
|
| 146 |
+
border-radius: 999px;
|
| 147 |
+
font: inherit;
|
| 148 |
+
cursor: pointer;
|
| 149 |
+
transition: all 0.2s ease;
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
.tab.active {
|
| 153 |
+
background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
|
| 154 |
+
color: #0f172a;
|
| 155 |
+
font-weight: 600;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.tab-panel {
|
| 159 |
+
display: none;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.tab-panel.active {
|
| 163 |
+
display: block;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.form-row {
|
| 167 |
+
display: flex;
|
| 168 |
+
gap: 0.75rem;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.form-row input {
|
| 172 |
+
flex: 1;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.list {
|
| 176 |
+
margin-top: 1rem;
|
| 177 |
+
display: grid;
|
| 178 |
+
gap: 0.75rem;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
.list-grid {
|
| 182 |
+
margin-top: 1.5rem;
|
| 183 |
+
display: grid;
|
| 184 |
+
gap: 1.25rem;
|
| 185 |
+
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
| 186 |
+
align-items: start;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.list-section {
|
| 190 |
+
background: rgba(15, 23, 42, 0.4);
|
| 191 |
+
border: 1px solid rgba(148, 163, 184, 0.18);
|
| 192 |
+
border-radius: 16px;
|
| 193 |
+
padding: 1rem;
|
| 194 |
+
display: flex;
|
| 195 |
+
flex-direction: column;
|
| 196 |
+
gap: 0.75rem;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.list-section-header {
|
| 200 |
+
display: flex;
|
| 201 |
+
flex-direction: column;
|
| 202 |
+
gap: 0.35rem;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.list-section-header h3 {
|
| 206 |
+
margin: 0;
|
| 207 |
+
font-size: 1rem;
|
| 208 |
+
letter-spacing: 0.02em;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
.list-hint {
|
| 212 |
+
margin: 0;
|
| 213 |
+
font-size: 0.85rem;
|
| 214 |
+
color: #9ca3af;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.list-item {
|
| 218 |
+
padding: 0.75rem;
|
| 219 |
+
border-radius: 12px;
|
| 220 |
+
background: rgba(15, 23, 42, 0.55);
|
| 221 |
+
border: 1px solid rgba(148, 163, 184, 0.15);
|
| 222 |
+
display: flex;
|
| 223 |
+
justify-content: space-between;
|
| 224 |
+
align-items: center;
|
| 225 |
+
gap: 1rem;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.list-item button {
|
| 229 |
+
flex-shrink: 0;
|
| 230 |
+
display: inline-flex;
|
| 231 |
+
align-items: center;
|
| 232 |
+
gap: 0.45rem;
|
| 233 |
+
transition: all 0.2s ease;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
.list-item button.loading {
|
| 237 |
+
pointer-events: none;
|
| 238 |
+
opacity: 0.75;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.list-item button.loading::before {
|
| 242 |
+
content: '';
|
| 243 |
+
width: 0.9rem;
|
| 244 |
+
height: 0.9rem;
|
| 245 |
+
border-radius: 50%;
|
| 246 |
+
border: 2px solid rgba(148, 163, 184, 0.4);
|
| 247 |
+
border-top-color: #38bdf8;
|
| 248 |
+
animation: spin 0.8s linear infinite;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
.list-item button.success {
|
| 252 |
+
background: rgba(34, 197, 94, 0.18);
|
| 253 |
+
border-color: rgba(34, 197, 94, 0.35);
|
| 254 |
+
color: #86efac;
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
.list-item button.error {
|
| 258 |
+
background: rgba(248, 113, 113, 0.18);
|
| 259 |
+
border-color: rgba(248, 113, 113, 0.35);
|
| 260 |
+
color: #fca5a5;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
@keyframes spin {
|
| 264 |
+
to {
|
| 265 |
+
transform: rotate(360deg);
|
| 266 |
+
}
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.list-item.selected {
|
| 270 |
+
border-color: rgba(56, 189, 248, 0.6);
|
| 271 |
+
box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.empty-state {
|
| 275 |
+
padding: 1rem;
|
| 276 |
+
text-align: center;
|
| 277 |
+
border: 1px dashed rgba(148, 163, 184, 0.25);
|
| 278 |
+
border-radius: 12px;
|
| 279 |
+
color: #94a3b8;
|
| 280 |
+
font-size: 0.9rem;
|
| 281 |
+
background: rgba(15, 23, 42, 0.35);
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
.actions {
|
| 285 |
+
display: flex;
|
| 286 |
+
gap: 1rem;
|
| 287 |
+
align-items: center;
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
button {
|
| 291 |
+
border: none;
|
| 292 |
+
border-radius: 10px;
|
| 293 |
+
padding: 0.65rem 1.1rem;
|
| 294 |
+
font: inherit;
|
| 295 |
+
cursor: pointer;
|
| 296 |
+
color: #0f172a;
|
| 297 |
+
background: #e2e8f0;
|
| 298 |
+
transition: transform 0.2s ease, box-shadow 0.2s ease;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
button.primary {
|
| 302 |
+
background: linear-gradient(135deg, #38bdf8 0%, #818cf8 100%);
|
| 303 |
+
color: #0f172a;
|
| 304 |
+
font-weight: 600;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
button.secondary {
|
| 308 |
+
background: rgba(148, 163, 184, 0.2);
|
| 309 |
+
color: #e5e7eb;
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
button:hover {
|
| 313 |
+
transform: translateY(-1px);
|
| 314 |
+
box-shadow: 0 10px 25px rgba(15, 23, 42, 0.25);
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.status-text {
|
| 318 |
+
color: #eab308;
|
| 319 |
+
font-size: 0.9rem;
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
#transcript-container {
|
| 323 |
+
max-height: 420px;
|
| 324 |
+
overflow: auto;
|
| 325 |
+
border-radius: 12px;
|
| 326 |
+
background: rgba(15, 23, 42, 0.4);
|
| 327 |
+
border: 1px solid rgba(148, 163, 184, 0.15);
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
#transcript-list {
|
| 331 |
+
list-style: none;
|
| 332 |
+
padding: 0;
|
| 333 |
+
margin: 0;
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
.utterance-item {
|
| 337 |
+
padding: 0.85rem 1rem;
|
| 338 |
+
border-bottom: 1px solid rgba(148, 163, 184, 0.1);
|
| 339 |
+
transition: background 0.2s ease;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
.utterance-item:last-child {
|
| 343 |
+
border-bottom: none;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
.utterance-item.active {
|
| 347 |
+
background: rgba(56, 189, 248, 0.15);
|
| 348 |
+
border-left: 3px solid #38bdf8;
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
.utterance-header {
|
| 352 |
+
display: flex;
|
| 353 |
+
align-items: center;
|
| 354 |
+
gap: 0.75rem;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
.timestamp {
|
| 358 |
+
font-size: 0.8rem;
|
| 359 |
+
color: #94a3b8;
|
| 360 |
+
min-width: 70px;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
.speaker-tag {
|
| 364 |
+
font-size: 0.75rem;
|
| 365 |
+
padding: 0.1rem 0.5rem;
|
| 366 |
+
border-radius: 999px;
|
| 367 |
+
background: rgba(129, 140, 248, 0.2);
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.utterance-actions {
|
| 371 |
+
margin-left: auto;
|
| 372 |
+
display: flex;
|
| 373 |
+
gap: 0.5rem;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.edit-btn {
|
| 377 |
+
background: rgba(148, 163, 184, 0.2);
|
| 378 |
+
color: #e5e7eb;
|
| 379 |
+
padding: 0.3rem 0.6rem;
|
| 380 |
+
font-size: 0.85rem;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
.utterance-text {
|
| 384 |
+
margin-top: 0.4rem;
|
| 385 |
+
line-height: 1.5;
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
.edit-area {
|
| 389 |
+
margin-top: 0.6rem;
|
| 390 |
+
display: grid;
|
| 391 |
+
gap: 0.5rem;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
.edit-area textarea {
|
| 395 |
+
width: 100%;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
.edit-controls {
|
| 399 |
+
display: flex;
|
| 400 |
+
gap: 0.5rem;
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
.summary {
|
| 404 |
+
min-height: 120px;
|
| 405 |
+
background: rgba(15, 23, 42, 0.5);
|
| 406 |
+
border-radius: 12px;
|
| 407 |
+
padding: 1rem;
|
| 408 |
+
border: 1px solid rgba(148, 163, 184, 0.15);
|
| 409 |
+
white-space: pre-wrap;
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
.export-grid {
|
| 413 |
+
display: grid;
|
| 414 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 415 |
+
gap: 0.75rem;
|
| 416 |
+
align-items: end;
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
#diarization-metrics,
|
| 420 |
+
#speaker-breakdown {
|
| 421 |
+
display: grid;
|
| 422 |
+
gap: 0.75rem;
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
.metric-card {
|
| 426 |
+
padding: 0.75rem;
|
| 427 |
+
border-radius: 12px;
|
| 428 |
+
background: rgba(15, 23, 42, 0.5);
|
| 429 |
+
border: 1px solid rgba(148, 163, 184, 0.1);
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
.hidden {
|
| 433 |
+
display: none !important;
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
@media (max-width: 1100px) {
|
| 437 |
+
.app-shell {
|
| 438 |
+
grid-template-columns: 1fr;
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
.sidebar {
|
| 442 |
+
order: 2;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
.content {
|
| 446 |
+
order: 1;
|
| 447 |
+
}
|
| 448 |
+
}
|
requirements.txt
CHANGED
|
@@ -1,7 +1,4 @@
|
|
| 1 |
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
-
altair
|
| 3 |
-
pandas
|
| 4 |
-
streamlit
|
| 5 |
numpy<2.0
|
| 6 |
soundfile
|
| 7 |
onnxruntime
|
|
@@ -16,4 +13,9 @@ ffmpeg-python
|
|
| 16 |
feedparser
|
| 17 |
sherpa_onnx
|
| 18 |
huggingface_hub
|
| 19 |
-
faiss-cpu
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
--extra-index-url https://download.pytorch.org/whl/cpu
|
|
|
|
|
|
|
|
|
|
| 2 |
numpy<2.0
|
| 3 |
soundfile
|
| 4 |
onnxruntime
|
|
|
|
| 13 |
feedparser
|
| 14 |
sherpa_onnx
|
| 15 |
huggingface_hub
|
| 16 |
+
faiss-cpu
|
| 17 |
+
fastapi
|
| 18 |
+
uvicorn[standard]
|
| 19 |
+
python-multipart
|
| 20 |
+
jinja2
|
| 21 |
+
aiofiles
|
src/asr.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
# asr.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import soundfile as sf
|
| 4 |
from scipy.signal import resample_poly
|
| 5 |
-
import re
|
| 6 |
-
from typing import Optional, Tuple, List
|
| 7 |
-
import tempfile
|
| 8 |
-
import os
|
| 9 |
|
| 10 |
# Lazy / optional imports: guard heavy or optional ASR backends
|
| 11 |
try:
|
|
@@ -20,11 +21,7 @@ except Exception:
|
|
| 20 |
MoonshineOnnxModel = None
|
| 21 |
load_tokenizer = None
|
| 22 |
|
| 23 |
-
from utils import
|
| 24 |
-
import re
|
| 25 |
-
from typing import Optional, Tuple, List
|
| 26 |
-
import tempfile
|
| 27 |
-
import os
|
| 28 |
|
| 29 |
SAMPLING_RATE = 16000
|
| 30 |
CHUNK_SIZE = 512
|
|
@@ -44,8 +41,8 @@ def transcribe_file(
|
|
| 44 |
model_name: str,
|
| 45 |
backend: str = "moonshine",
|
| 46 |
language: str = "auto",
|
| 47 |
-
textnorm: str = "withitn"
|
| 48 |
-
) -> Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]:
|
| 49 |
"""
|
| 50 |
Transcribe audio file using specified backend.
|
| 51 |
|
|
|
|
| 1 |
# asr.py
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import tempfile
|
| 5 |
+
from typing import Iterable, List, Optional, Tuple
|
| 6 |
+
|
| 7 |
import numpy as np
|
| 8 |
import soundfile as sf
|
| 9 |
from scipy.signal import resample_poly
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# Lazy / optional imports: guard heavy or optional ASR backends
|
| 12 |
try:
|
|
|
|
| 21 |
MoonshineOnnxModel = None
|
| 22 |
load_tokenizer = None
|
| 23 |
|
| 24 |
+
from .utils import load_sensevoice_model, s2tw_converter
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
SAMPLING_RATE = 16000
|
| 27 |
CHUNK_SIZE = 512
|
|
|
|
| 41 |
model_name: str,
|
| 42 |
backend: str = "moonshine",
|
| 43 |
language: str = "auto",
|
| 44 |
+
textnorm: str = "withitn",
|
| 45 |
+
) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]]:
|
| 46 |
"""
|
| 47 |
Transcribe audio file using specified backend.
|
| 48 |
|
src/diarization.py
CHANGED
|
@@ -17,12 +17,11 @@ import numpy as np
|
|
| 17 |
import sherpa_onnx
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import List, Tuple, Optional, Callable, Dict, Any
|
| 20 |
-
import streamlit as st
|
| 21 |
import logging
|
| 22 |
-
from utils import get_writable_model_dir
|
| 23 |
-
from utils import num_vcpus
|
| 24 |
from huggingface_hub import hf_hub_download
|
| 25 |
import shutil
|
|
|
|
| 26 |
|
| 27 |
# Import the improved diarization pipeline (robust: search repo tree)
|
| 28 |
try:
|
|
@@ -95,11 +94,11 @@ def download_diarization_models():
|
|
| 95 |
repo_id = "csukuangfj/speaker-embedding-models"
|
| 96 |
filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
|
| 97 |
embedding_model = models_dir / filename
|
| 98 |
-
|
| 99 |
try:
|
| 100 |
# Download using huggingface_hub if not present
|
| 101 |
if not embedding_model.exists():
|
| 102 |
-
|
| 103 |
downloaded_path = hf_hub_download(
|
| 104 |
repo_id=repo_id,
|
| 105 |
filename=filename,
|
|
@@ -111,10 +110,10 @@ def download_diarization_models():
|
|
| 111 |
# Move/copy to expected location if needed
|
| 112 |
if Path(downloaded_path) != embedding_model:
|
| 113 |
shutil.copy(downloaded_path, embedding_model)
|
| 114 |
-
|
| 115 |
return str(embedding_model), True
|
| 116 |
except Exception as e:
|
| 117 |
-
|
| 118 |
return None, False
|
| 119 |
|
| 120 |
def init_speaker_embedding_extractor(
|
|
@@ -137,26 +136,26 @@ def init_speaker_embedding_extractor(
|
|
| 137 |
embedding_model, success = download_diarization_models()
|
| 138 |
if not success:
|
| 139 |
return None
|
| 140 |
-
|
| 141 |
# Create embedding extractor config
|
| 142 |
embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
|
| 143 |
model=embedding_model,
|
| 144 |
num_threads=num_vcpus
|
| 145 |
)
|
| 146 |
-
|
| 147 |
# Initialize embedding extractor
|
| 148 |
embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
|
| 149 |
-
|
| 150 |
# Store clustering parameters separately
|
| 151 |
config_dict = {
|
| 152 |
'cluster_threshold': cluster_threshold,
|
| 153 |
'num_speakers': num_speakers
|
| 154 |
}
|
| 155 |
-
|
| 156 |
return embedding_extractor, config_dict
|
| 157 |
-
|
| 158 |
except Exception as e:
|
| 159 |
-
|
| 160 |
return None
|
| 161 |
|
| 162 |
def perform_speaker_diarization_on_utterances(
|
|
@@ -195,19 +194,13 @@ def perform_speaker_diarization_on_utterances(
|
|
| 195 |
# Check sample rate
|
| 196 |
if sample_rate != 16000:
|
| 197 |
warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
|
| 198 |
-
|
| 199 |
-
st.warning(warning_msg)
|
| 200 |
-
print(warning_msg)
|
| 201 |
|
| 202 |
if not utterances:
|
| 203 |
-
|
| 204 |
-
st.warning("⚠️ No utterances provided for diarization")
|
| 205 |
-
print("⚠️ No utterances provided for diarization")
|
| 206 |
return []
|
| 207 |
|
| 208 |
-
|
| 209 |
-
st.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
|
| 210 |
-
print(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
|
| 211 |
|
| 212 |
# Extract embeddings for each utterance segment
|
| 213 |
embeddings = []
|
|
@@ -258,12 +251,12 @@ def perform_speaker_diarization_on_utterances(
|
|
| 258 |
continue
|
| 259 |
|
| 260 |
if not embeddings:
|
| 261 |
-
|
| 262 |
print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
|
| 263 |
return []
|
| 264 |
|
| 265 |
print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
|
| 266 |
-
|
| 267 |
|
| 268 |
# Convert embeddings to numpy array
|
| 269 |
embeddings_array = np.array(embeddings)
|
|
@@ -272,7 +265,7 @@ def perform_speaker_diarization_on_utterances(
|
|
| 272 |
# Use enhanced diarization if available
|
| 273 |
if ENHANCED_DIARIZATION_AVAILABLE:
|
| 274 |
print("🚀 Using enhanced diarization with adaptive clustering...")
|
| 275 |
-
|
| 276 |
|
| 277 |
# Prepare utterances dict format for enhanced pipeline
|
| 278 |
utterances_dict = []
|
|
@@ -300,11 +293,11 @@ def perform_speaker_diarization_on_utterances(
|
|
| 300 |
|
| 301 |
quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
|
| 302 |
if quality in ['excellent', 'good']:
|
| 303 |
-
|
| 304 |
elif quality == 'fair':
|
| 305 |
-
|
| 306 |
else:
|
| 307 |
-
|
| 308 |
|
| 309 |
print(f"✅ Enhanced diarization quality report:")
|
| 310 |
print(f" - Quality: {quality}")
|
|
@@ -314,7 +307,7 @@ def perform_speaker_diarization_on_utterances(
|
|
| 314 |
print(f" - Speakers detected: {n_speakers}")
|
| 315 |
|
| 316 |
if quality_report['recommendations']:
|
| 317 |
-
|
| 318 |
|
| 319 |
# Convert back to tuple format
|
| 320 |
diarization_result = []
|
|
@@ -325,17 +318,17 @@ def perform_speaker_diarization_on_utterances(
|
|
| 325 |
progress_callback(1.0) # 100% complete
|
| 326 |
|
| 327 |
print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
|
| 328 |
-
|
| 329 |
|
| 330 |
return diarization_result
|
| 331 |
|
| 332 |
except Exception as e:
|
| 333 |
-
|
| 334 |
print(f"❌ Enhanced diarization failed: {e}")
|
| 335 |
# Fall back to original clustering
|
| 336 |
|
| 337 |
# Fallback to original clustering
|
| 338 |
-
|
| 339 |
print("⚠️ Using fallback clustering")
|
| 340 |
|
| 341 |
# >>> NEW: FAISS clustering when available, otherwise the legacy code
|
|
@@ -349,8 +342,6 @@ def perform_speaker_diarization_on_utterances(
|
|
| 349 |
print(error_msg)
|
| 350 |
import traceback
|
| 351 |
traceback.print_exc()
|
| 352 |
-
if hasattr(st, '_is_running_with_streamlit') and st._is_running_with_streamlit:
|
| 353 |
-
st.error(error_msg)
|
| 354 |
return []
|
| 355 |
|
| 356 |
def merge_transcription_with_diarization(
|
|
@@ -555,7 +546,7 @@ def faiss_clustering(embeddings: np.ndarray,
|
|
| 555 |
|
| 556 |
num_speakers = len(set(labels))
|
| 557 |
print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
|
| 558 |
-
|
| 559 |
|
| 560 |
return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
|
| 561 |
|
|
|
|
| 17 |
import sherpa_onnx
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import List, Tuple, Optional, Callable, Dict, Any
|
|
|
|
| 20 |
import logging
|
| 21 |
+
from .utils import get_writable_model_dir, num_vcpus
|
|
|
|
| 22 |
from huggingface_hub import hf_hub_download
|
| 23 |
import shutil
|
| 24 |
+
from sklearn.metrics import silhouette_score
|
| 25 |
|
| 26 |
# Import the improved diarization pipeline (robust: search repo tree)
|
| 27 |
try:
|
|
|
|
| 94 |
repo_id = "csukuangfj/speaker-embedding-models"
|
| 95 |
filename = "3dspeaker_speech_campplus_sv_zh_en_16k-common_advanced.onnx"
|
| 96 |
embedding_model = models_dir / filename
|
| 97 |
+
logger.info(f"Model cache directory: {models_dir}")
|
| 98 |
try:
|
| 99 |
# Download using huggingface_hub if not present
|
| 100 |
if not embedding_model.exists():
|
| 101 |
+
logger.info("📥 Downloading eres2netv2 Chinese speaker model from HuggingFace (29MB)...")
|
| 102 |
downloaded_path = hf_hub_download(
|
| 103 |
repo_id=repo_id,
|
| 104 |
filename=filename,
|
|
|
|
| 110 |
# Move/copy to expected location if needed
|
| 111 |
if Path(downloaded_path) != embedding_model:
|
| 112 |
shutil.copy(downloaded_path, embedding_model)
|
| 113 |
+
logger.info("✅ eres2netv2 Chinese embedding model downloaded!")
|
| 114 |
return str(embedding_model), True
|
| 115 |
except Exception as e:
|
| 116 |
+
logger.error(f"❌ Failed to download diarization models: {e}")
|
| 117 |
return None, False
|
| 118 |
|
| 119 |
def init_speaker_embedding_extractor(
|
|
|
|
| 136 |
embedding_model, success = download_diarization_models()
|
| 137 |
if not success:
|
| 138 |
return None
|
| 139 |
+
|
| 140 |
# Create embedding extractor config
|
| 141 |
embedding_config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
|
| 142 |
model=embedding_model,
|
| 143 |
num_threads=num_vcpus
|
| 144 |
)
|
| 145 |
+
|
| 146 |
# Initialize embedding extractor
|
| 147 |
embedding_extractor = sherpa_onnx.SpeakerEmbeddingExtractor(embedding_config)
|
| 148 |
+
|
| 149 |
# Store clustering parameters separately
|
| 150 |
config_dict = {
|
| 151 |
'cluster_threshold': cluster_threshold,
|
| 152 |
'num_speakers': num_speakers
|
| 153 |
}
|
| 154 |
+
|
| 155 |
return embedding_extractor, config_dict
|
| 156 |
+
|
| 157 |
except Exception as e:
|
| 158 |
+
logger.error(f"❌ Failed to initialize speaker embedding extractor: {e}")
|
| 159 |
return None
|
| 160 |
|
| 161 |
def perform_speaker_diarization_on_utterances(
|
|
|
|
| 194 |
# Check sample rate
|
| 195 |
if sample_rate != 16000:
|
| 196 |
warning_msg = f"⚠️ Audio sample rate is {sample_rate}Hz, but 16kHz is optimal for diarization"
|
| 197 |
+
logger.warning(warning_msg)
|
|
|
|
|
|
|
| 198 |
|
| 199 |
if not utterances:
|
| 200 |
+
logger.warning("⚠️ No utterances provided for diarization")
|
|
|
|
|
|
|
| 201 |
return []
|
| 202 |
|
| 203 |
+
logger.info(f"🎭 Extracting embeddings from {len(utterances)} utterance segments...")
|
|
|
|
|
|
|
| 204 |
|
| 205 |
# Extract embeddings for each utterance segment
|
| 206 |
embeddings = []
|
|
|
|
| 251 |
continue
|
| 252 |
|
| 253 |
if not embeddings:
|
| 254 |
+
logger.error("❌ No valid embeddings extracted")
|
| 255 |
print(f"❌ DEBUG: Failed to extract any embeddings from {len(utterances)} utterances")
|
| 256 |
return []
|
| 257 |
|
| 258 |
print(f"✅ DEBUG: Extracted {len(embeddings)} embeddings for clustering")
|
| 259 |
+
logger.info(f"✅ Extracted {len(embeddings)} embeddings, performing clustering...")
|
| 260 |
|
| 261 |
# Convert embeddings to numpy array
|
| 262 |
embeddings_array = np.array(embeddings)
|
|
|
|
| 265 |
# Use enhanced diarization if available
|
| 266 |
if ENHANCED_DIARIZATION_AVAILABLE:
|
| 267 |
print("🚀 Using enhanced diarization with adaptive clustering...")
|
| 268 |
+
logger.info("🚀 Using enhanced adaptive clustering...")
|
| 269 |
|
| 270 |
# Prepare utterances dict format for enhanced pipeline
|
| 271 |
utterances_dict = []
|
|
|
|
| 293 |
|
| 294 |
quality_msg = f"🎯 Diarization Quality: {confidence} confidence ({quality})"
|
| 295 |
if quality in ['excellent', 'good']:
|
| 296 |
+
logger.info(quality_msg)
|
| 297 |
elif quality == 'fair':
|
| 298 |
+
logger.warning(quality_msg)
|
| 299 |
else:
|
| 300 |
+
logger.error(quality_msg)
|
| 301 |
|
| 302 |
print(f"✅ Enhanced diarization quality report:")
|
| 303 |
print(f" - Quality: {quality}")
|
|
|
|
| 307 |
print(f" - Speakers detected: {n_speakers}")
|
| 308 |
|
| 309 |
if quality_report['recommendations']:
|
| 310 |
+
logger.info("💡 " + "; ".join(quality_report['recommendations']))
|
| 311 |
|
| 312 |
# Convert back to tuple format
|
| 313 |
diarization_result = []
|
|
|
|
| 318 |
progress_callback(1.0) # 100% complete
|
| 319 |
|
| 320 |
print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
|
| 321 |
+
logger.info(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
|
| 322 |
|
| 323 |
return diarization_result
|
| 324 |
|
| 325 |
except Exception as e:
|
| 326 |
+
logger.error(f"❌ Enhanced diarization failed: {e}")
|
| 327 |
print(f"❌ Enhanced diarization failed: {e}")
|
| 328 |
# Fall back to original clustering
|
| 329 |
|
| 330 |
# Fallback to original clustering
|
| 331 |
+
logger.warning("⚠️ Using fallback clustering")
|
| 332 |
print("⚠️ Using fallback clustering")
|
| 333 |
|
| 334 |
# >>> NEW: FAISS clustering when available, otherwise the legacy code
|
|
|
|
| 342 |
print(error_msg)
|
| 343 |
import traceback
|
| 344 |
traceback.print_exc()
|
|
|
|
|
|
|
| 345 |
return []
|
| 346 |
|
| 347 |
def merge_transcription_with_diarization(
|
|
|
|
| 546 |
|
| 547 |
num_speakers = len(set(labels))
|
| 548 |
print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
|
| 549 |
+
logger.info(f"🎭 FAISS clustering completed! Detected {num_speakers} speakers")
|
| 550 |
|
| 551 |
return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
|
| 552 |
|
src/server/__init__.py
ADDED
|
File without changes
|
src/server/core/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Fourth ancestor (parents[3]) of this file's resolved path.
BASE_DIR = Path(__file__).resolve().parents[3]

# Scratch space; the model cache lives beneath it.
TMP_DIR = BASE_DIR / "tmp"
MODEL_CACHE_DIR = TMP_DIR / "models"

# Served assets: generic static files, stored audio, and the web frontend.
STATIC_DIR = BASE_DIR / "static"
AUDIO_DIR = STATIC_DIR / "audio"
FRONTEND_DIR = BASE_DIR / "frontend"

# Create every required directory at import time (no-op when already present).
for directory in (STATIC_DIR, AUDIO_DIR, MODEL_CACHE_DIR, TMP_DIR, FRONTEND_DIR):
    directory.mkdir(parents=True, exist_ok=True)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class Settings:
    """Static application settings exposed as class-level attributes."""

    # Application title.
    app_name: str = "VoxSum Studio API"

    # Filesystem locations, taken from the module-level directory constants.
    static_dir: Path = STATIC_DIR
    audio_dir: Path = AUDIO_DIR
    frontend_dir: Path = FRONTEND_DIR
    tmp_dir: Path = TMP_DIR
    model_cache_dir: Path = MODEL_CACHE_DIR

    # Read once, when the class body executes, from VOXSUM_MAX_AUDIO_FILES
    # (falls back to "20" when the variable is unset).
    max_audio_files: int = int(os.getenv("VOXSUM_MAX_AUDIO_FILES", "20"))
    transcription_chunk_size: int = 100
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The single-entry ``lru_cache`` means the object is built on the first
    call and the same instance is handed back afterwards.
    """
    instance = Settings()
    return instance
|
src/server/main.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI
|
| 4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from fastapi.staticfiles import StaticFiles
|
| 6 |
+
|
| 7 |
+
from .core.config import get_settings
|
| 8 |
+
from .routers.api import router as api_router
|
| 9 |
+
|
| 10 |
+
settings = get_settings()

app = FastAPI(title=settings.app_name)

# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive; restrict allow_origins if authenticated endpoints are ever added.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(api_router)


# Declared BEFORE the catch-all "/" mount below: routes are matched in
# registration order, so a route added after that mount would never be
# reached (the frontend StaticFiles app would answer /health instead).
@app.get("/health")
def healthcheck() -> dict[str, str]:
    """Liveness probe: report that the application is up."""
    return {"status": "ok"}


app.mount("/static", StaticFiles(directory=settings.static_dir), name="static")
app.mount("/media", StaticFiles(directory=settings.audio_dir), name="media")
# Keep this mount last; it matches every remaining path and serves the frontend.
app.mount("/", StaticFiles(directory=settings.frontend_dir, html=True), name="frontend")
|
src/server/routers/api.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
| 7 |
+
from fastapi.responses import StreamingResponse
|
| 8 |
+
|
| 9 |
+
from ..models.export import SummaryExportRequest, TranscriptExportRequest
|
| 10 |
+
from ..models.summarization import SummaryRequest
|
| 11 |
+
from ..models.transcription import TranscriptionRequest
|
| 12 |
+
from ..core.config import get_settings
|
| 13 |
+
from ..services import config_service, export_service, podcast_service
|
| 14 |
+
from ..services.asr_service import iter_transcription_events
|
| 15 |
+
from ..services.file_service import save_upload_file, store_audio_file
|
| 16 |
+
from ..services.summarization_service import iter_summary_events
|
| 17 |
+
|
| 18 |
+
router = APIRouter(prefix="/api")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@router.get("/config/models")
def fetch_model_catalog():
    """Return the model catalog produced by ``config_service.get_model_catalog``."""
    catalog = config_service.get_model_catalog()
    return catalog
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@router.post("/transcribe")
def transcribe_audio(
    audio: UploadFile | None = File(default=None),
    options: str = Form("{}"),
    source: str | None = Form(default=None),
):
    """Transcribe an uploaded or previously stored audio file as an NDJSON stream.

    Args:
        audio: Optional uploaded audio file; takes precedence over ``source``.
        options: JSON object (as a string) used to build a
            ``TranscriptionRequest``.
        source: Optional reference to a file already in the audio directory;
            only its final path component is used.

    Returns:
        A ``StreamingResponse`` emitting one JSON event per line.

    Raises:
        HTTPException: 400 when ``options`` cannot be parsed/validated or when
            neither ``audio`` nor ``source`` is given; 404 when ``source`` does
            not name an existing file.
    """
    try:
        payload = TranscriptionRequest(**json.loads(options or "{}"))
    except (ValueError, TypeError) as exc:
        # ValueError covers malformed JSON (and validation errors deriving from
        # it); TypeError covers JSON that is not an object (e.g. a list/null).
        raise HTTPException(status_code=400, detail=f"Invalid options: {exc}") from exc

    cleanup_temp = False
    if audio is not None:
        temp_path = save_upload_file(audio)
        try:
            _, audio_url = store_audio_file(temp_path)
        except Exception:
            # Do not leave the temporary upload behind when storing fails.
            temp_path.unlink(missing_ok=True)
            raise
        cleanup_temp = True
    elif source:
        # Keep only the final path component so the lookup stays inside the
        # audio directory.
        filename = Path(source).name
        candidate_path = get_settings().audio_dir / filename
        # is_file() (rather than exists()) rejects directories, including the
        # audio directory itself when the name is empty.
        if not filename or not candidate_path.is_file():
            raise HTTPException(status_code=404, detail="Audio source not found")
        temp_path = candidate_path
        audio_url = source
    else:
        raise HTTPException(status_code=400, detail="Either audio upload or source is required")

    def event_stream():
        try:
            for event in iter_transcription_events(temp_path, audio_url, payload):
                yield json.dumps(event, ensure_ascii=False) + "\n"
        finally:
            # Only uploads create a temporary file; stored sources are kept.
            if cleanup_temp:
                temp_path.unlink(missing_ok=True)

    return StreamingResponse(event_stream(), media_type="application/x-ndjson")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@router.post("/summarize")
def summarize_text(request: SummaryRequest):
    """Stream summarization events for ``request`` as newline-delimited JSON."""

    def ndjson_lines():
        # One JSON document per line; non-ASCII text is emitted unescaped.
        yield from (
            json.dumps(item, ensure_ascii=False) + "\n"
            for item in iter_summary_events(request)
        )

    return StreamingResponse(ndjson_lines(), media_type="application/x-ndjson")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@router.get("/podcast/search")
def search_podcast(query: str):
    """Return ``podcast_service.search_series`` results for ``query``."""
    matches = podcast_service.search_series(query)
    return matches
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@router.get("/podcast/episodes")
def get_podcast_episodes(feed_url: str):
    """Return ``podcast_service.list_episodes`` results for ``feed_url``."""
    episodes = podcast_service.list_episodes(feed_url)
    return episodes
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@router.post("/podcast/download")
def download_episode(payload: dict):
    """Download a podcast episode described by ``payload``.

    The audio URL is read from ``audioUrl`` (or ``audio_url`` as a fallback);
    ``title`` defaults to ``"Episode"``. A missing/empty URL yields HTTP 400.
    """
    audio_url = payload.get("audioUrl") or payload.get("audio_url")
    if not audio_url:
        raise HTTPException(status_code=400, detail="audioUrl is required")

    title = payload.get("title", "Episode")
    return podcast_service.download_episode(audio_url, title)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@router.post("/youtube/fetch")
def fetch_youtube_audio(payload: dict):
    """Fetch audio for the video URL given in ``payload``.

    The URL is read from ``url`` (or ``youtubeUrl`` as a fallback); a
    missing/empty value yields HTTP 400.
    """
    url = payload.get("url") or payload.get("youtubeUrl")
    if url:
        return podcast_service.fetch_youtube_audio(url)
    raise HTTPException(status_code=400, detail="url is required")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@router.post("/export/transcript")
def export_transcript(payload: TranscriptExportRequest):
    """Return the transcript rendered in the requested format as a download.

    Raises:
        HTTPException: 400 when the export service rejects the request with a
            ``ValueError`` (it raises one for an unsupported format).
    """
    try:
        content, filename, mime_type = export_service.generate_transcript_export(payload)
    except ValueError as exc:
        # A bad format is a client error, not an internal server error.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return StreamingResponse(
        iter([content.encode("utf-8")]),
        media_type=mime_type,
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@router.post("/export/summary")
def export_summary(payload: SummaryExportRequest):
    """Return the summary rendered in the requested format as a download.

    Raises:
        HTTPException: 400 when the export service rejects the request with a
            ``ValueError`` (it raises one for an unsupported format).
    """
    try:
        content, filename, mime_type = export_service.generate_summary_export(payload)
    except ValueError as exc:
        # A bad format is a client error, not an internal server error.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return StreamingResponse(
        iter([content.encode("utf-8")]),
        media_type=mime_type,
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
|
src/server/services/asr_service.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Dict, Iterable, List, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
import soundfile as sf
|
| 7 |
+
from fastapi import HTTPException
|
| 8 |
+
|
| 9 |
+
from src.asr import transcribe_file
|
| 10 |
+
from src.diarization import (
|
| 11 |
+
get_diarization_stats,
|
| 12 |
+
init_speaker_embedding_extractor,
|
| 13 |
+
merge_consecutive_utterances,
|
| 14 |
+
merge_transcription_with_diarization,
|
| 15 |
+
perform_speaker_diarization_on_utterances,
|
| 16 |
+
)
|
| 17 |
+
from src.utils import sensevoice_models
|
| 18 |
+
|
| 19 |
+
from ..core.config import get_settings
|
| 20 |
+
from ..models.transcription import DiarizationOptions, TranscriptionRequest
|
| 21 |
+
|
| 22 |
+
settings = get_settings()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _serialize_utterance(utt: Tuple[float, float, str], speaker: Optional[int] = None) -> Dict[str, object]:
    """Convert a ``(start, end, text)`` utterance into a JSON-ready dict.

    ``start`` and ``end`` are coerced to float and rounded to three decimals.
    A ``speaker`` key (as an int) is added only when ``speaker`` is not None.
    """
    begin, finish, spoken = utt
    record: Dict[str, object] = dict(
        start=round(float(begin), 3),
        end=round(float(finish), 3),
        text=spoken,
    )
    if speaker is None:
        return record
    record["speaker"] = int(speaker)
    return record
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _prepare_model_name(options: TranscriptionRequest) -> str:
    """Resolve the model identifier to use for ``options``.

    For the ``sensevoice`` backend the name is looked up in
    ``sensevoice_models`` (friendly name -> repo id), falling back to the name
    itself; every other backend uses ``options.model_name`` unchanged.
    """
    requested = options.model_name
    if options.backend != "sensevoice":
        return requested
    # Unknown friendly names pass through as-is.
    return sensevoice_models.get(requested, requested)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def iter_transcription_events(
    audio_path: Path,
    audio_url: str,
    options: TranscriptionRequest,
) -> Iterable[Dict[str, object]]:
    """Yield transcription progress as JSON-serialisable event dicts.

    Events, in order:
      * ``ready``     - echoes the audio URL, backend and resolved model name;
      * ``utterance`` - one per non-empty utterance reported by
        ``transcribe_file``, with its index in the running list;
      * ``complete``  - every utterance, the newline-joined transcript and the
        diarization payload (``None`` when diarization is disabled or
        produced nothing).

    On failure an ``error`` event carrying the message is yielded and the
    original exception is then re-raised, so the stream still aborts.
    """
    model_name = _prepare_model_name(options)

    try:
        generator = transcribe_file(
            audio_path=str(audio_path),
            vad_threshold=options.vad_threshold,
            model_name=model_name,
            backend=options.backend,
            language=options.language,
            textnorm=options.textnorm,
        )

        yield {
            "type": "ready",
            "audioUrl": audio_url,
            "backend": options.backend,
            "model": model_name,
        }

        final_utterances: List[Tuple[float, float, str]] = []

        for current_utterance, all_utterances in generator:
            if current_utterance:
                start, end, text = current_utterance
                yield {
                    "type": "utterance",
                    "utterance": _serialize_utterance((start, end, text)),
                    "index": len(all_utterances) - 1,
                }
            # Snapshot the running list so the final event has all utterances.
            final_utterances = list(all_utterances)

        # Final event with transcript and optional diarization
        diarization_payload = None
        if options.diarization.enable:
            diarization_payload = _run_diarization(audio_path, final_utterances, options.diarization)

        transcript_text = "\n".join([utt[2] for utt in final_utterances])

        yield {
            "type": "complete",
            "utterances": [_serialize_utterance(utt) for utt in final_utterances],
            "transcript": transcript_text,
            "diarization": diarization_payload,
        }

    except Exception as exc:  # pragma: no cover
        # This generator is consumed while a streaming response is already
        # being sent, so an HTTPException can no longer become an HTTP error
        # response. Tell the client in-band, then re-raise the real exception
        # so the stream aborts and the server logs the original traceback.
        yield {"type": "error", "message": f"Transcription failed: {exc}"}
        raise
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _run_diarization(
    audio_path: Path,
    utterances: List[Tuple[float, float, str]],
    options: DiarizationOptions,
) -> Optional[Dict[str, object]]:
    """Assign speakers to ``utterances`` and summarise the result.

    Args:
        audio_path: Audio file to read with ``soundfile``.
        utterances: ``(start, end, text)`` tuples from transcription.
        options: Supplies ``cluster_threshold`` and ``num_speakers``.

    Returns:
        ``None`` when there are no utterances, the embedding extractor cannot
        be initialised, or diarization yields no segments; otherwise a dict
        with speaker-tagged ``utterances`` and ``stats``.
    """
    if not utterances:
        return None

    extractor_result = init_speaker_embedding_extractor(
        cluster_threshold=options.cluster_threshold,
        num_speakers=options.num_speakers,
    )
    if not extractor_result:
        return None

    embedding_extractor, config_dict = extractor_result

    audio, sample_rate = sf.read(str(audio_path), dtype="float32")
    if audio.ndim > 1:
        # Collapse multi-dimensional audio to one dimension by averaging axis 1.
        audio = audio.mean(axis=1)

    if sample_rate != 16000:
        # Lazy import to avoid mandatory dependency during module import
        from scipy.signal import resample_poly

        # Polyphase resampling (as src/asr.py uses) instead of the FFT-based
        # scipy.signal.resample, which transforms the whole recording at once
        # and becomes very slow / memory hungry on long files.
        audio = resample_poly(audio, 16000, sample_rate).astype("float32", copy=False)
        sample_rate = 16000

    diarization_segments = perform_speaker_diarization_on_utterances(
        audio=audio,
        sample_rate=sample_rate,
        utterances=utterances,
        embedding_extractor=embedding_extractor,
        config_dict=config_dict,
        progress_callback=None,
    )

    if not diarization_segments:
        return None

    merged = merge_transcription_with_diarization(utterances, diarization_segments)
    merged = merge_consecutive_utterances(merged, max_gap=1.0)
    stats = get_diarization_stats(merged)

    return {
        "utterances": [
            _serialize_utterance((start, end, text), speaker)
            for start, end, text, speaker in merged
        ],
        "stats": stats,
    }
|
src/server/services/config_service.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
from src.utils import available_gguf_llms, model_names, sensevoice_models
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_model_catalog() -> Dict[str, object]:
    """Build the catalog of selectable models.

    Returns a dict with the ``moonshine`` and ``sensevoice`` model collections
    from ``src.utils`` plus ``llms``, where each ``(repo, filename)`` pair of
    ``available_gguf_llms`` is expanded into a ``{"repo", "filename"}`` dict.
    """
    llm_entries = {}
    for name, (repo, filename) in available_gguf_llms.items():
        llm_entries[name] = {"repo": repo, "filename": filename}

    catalog: Dict[str, object] = {
        "moonshine": model_names,
        "sensevoice": sensevoice_models,
        "llms": llm_entries,
    }
    return catalog
|
src/server/services/export_service.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
from src.export_utils import (
|
| 7 |
+
SUBTITLE_FORMATS,
|
| 8 |
+
SUMMARY_FORMATS,
|
| 9 |
+
TRANSCRIPT_FORMATS,
|
| 10 |
+
export_plain_text,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from ..models.export import SummaryExportRequest, TranscriptExportRequest
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _build_utterance_tuples(payload: TranscriptExportRequest):
    """Split the payload's utterances into plain and speaker-tagged tuples.

    Returns ``(plain, tagged)`` where ``plain`` holds ``(start, end, text)``
    tuples and ``tagged`` holds ``(start, end, text, speaker)`` tuples, or
    ``None`` when no utterance carries a speaker. A missing speaker inside an
    otherwise tagged transcript is reported as ``0``.
    """
    items = payload.utterances
    plain = [(item.start, item.end, item.text) for item in items]

    if all(item.speaker is None for item in items):
        return plain, None

    tagged = [
        (item.start, item.end, item.text, 0 if item.speaker is None else item.speaker)
        for item in items
    ]
    return plain, tagged
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def generate_transcript_export(payload: TranscriptExportRequest) -> Tuple[str, str, str]:
    """Render a transcript in the requested subtitle or transcript format.

    The format name is looked up in ``SUBTITLE_FORMATS`` first, then in
    ``TRANSCRIPT_FORMATS``. The transcript format ``"Plain Text"`` is rendered
    through ``export_plain_text`` so ``payload.include_timestamps`` is
    honoured; every other format uses the ``"function"`` entry of its spec.

    Returns:
        ``(content, filename, mime_type)`` where ``filename`` is
        ``transcript_<YYYYmmdd_HHMMSS><extension>``.

    Raises:
        ValueError: If ``payload.format`` is in neither format table.
    """
    plain, labelled = _build_utterance_tuples(payload)
    requested = payload.format

    # Step 1: resolve the format spec and decide which renderer applies.
    if requested in SUBTITLE_FORMATS:
        spec = SUBTITLE_FORMATS[requested]
        use_plain_text = False
    elif requested in TRANSCRIPT_FORMATS:
        spec = TRANSCRIPT_FORMATS[requested]
        use_plain_text = requested == "Plain Text"
    else:
        raise ValueError(f"Unsupported transcript export format: {requested}")

    # Step 2: render.
    if use_plain_text:
        body = export_plain_text(
            plain,
            labelled,
            include_timestamps=payload.include_timestamps,
        )
    else:
        body = spec["function"](plain, labelled)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return body, f"transcript_{stamp}{spec['extension']}", spec["mime_type"]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def generate_summary_export(payload: SummaryExportRequest) -> Tuple[str, str, str]:
    """Render a summary in the requested format.

    Returns:
        ``(content, filename, mime_type)`` where ``content`` comes from the
        format's ``"function"`` called with ``payload.summary`` and
        ``payload.metadata``, and ``filename`` is
        ``summary_<YYYYmmdd_HHMMSS><extension>``.

    Raises:
        ValueError: If ``payload.format`` is not a key of ``SUMMARY_FORMATS``.
    """
    requested = payload.format
    if requested in SUMMARY_FORMATS:
        spec = SUMMARY_FORMATS[requested]
        body = spec["function"](payload.summary, payload.metadata)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return body, f"summary_{stamp}{spec['extension']}", spec["mime_type"]

    raise ValueError(f"Unsupported summary export format: {requested}")
|
src/server/services/file_service.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import shutil
|
| 4 |
+
import uuid
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Tuple
|
| 7 |
+
|
| 8 |
+
from fastapi import UploadFile
|
| 9 |
+
|
| 10 |
+
from ..core.config import get_settings
|
| 11 |
+
|
| 12 |
+
settings = get_settings()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def cleanup_old_audio_files(max_files: int | None = None) -> None:
    """Delete the oldest entries in the audio directory beyond a retention limit.

    The directory is created if it does not exist. Entries are ordered by
    modification time and all but the newest ``max_files`` are unlinked.
    Deletion is best-effort: an entry that cannot be unlinked is skipped.

    Args:
        max_files: Number of newest entries to keep. A falsy value (``None``
            or ``0``) falls back to ``settings.max_audio_files``.
    """
    max_files = max_files or settings.max_audio_files
    audio_dir = settings.audio_dir
    audio_dir.mkdir(parents=True, exist_ok=True)

    def _mtime(path) -> float:
        # Stat once and tolerate failure. A separate exists() check followed
        # by stat() can still raise if the entry disappears in between
        # (e.g. another cleanup running at the same time). Entries that
        # cannot be stat'ed sort first, as if they were the oldest.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0

    files = sorted(audio_dir.glob("*"), key=_mtime)
    if len(files) <= max_files:
        return

    for old_file in files[:-max_files]:
        try:
            old_file.unlink()
        except OSError:
            # Already gone, a directory, or not removable: keep going.
            continue
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def save_upload_file(upload: UploadFile) -> Path:
    """Write an uploaded stream to a uniquely named file under ``settings.tmp_dir``.

    The directory is created if needed. The file extension is taken from the
    client-supplied filename and defaults to ``.mp3`` when the name is missing
    or has no suffix.

    Returns:
        The path of the newly written ``upload_<hex><suffix>`` file.
    """
    target_dir = settings.tmp_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    extension = Path(upload.filename or "audio").suffix
    if not extension:
        extension = ".mp3"

    destination = target_dir / f"upload_{uuid.uuid4().hex}{extension}"
    with open(destination, "wb") as sink:
        shutil.copyfileobj(upload.file, sink)
    return destination
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def store_audio_file(audio_path: Path, prefix: str | None = None) -> Tuple[Path, str]:
    """Publish an audio file by copying it into ``settings.audio_dir``.

    ``cleanup_old_audio_files()`` runs first, before the new copy is added.

    Args:
        audio_path: File to copy; its suffix is reused (``.mp3`` if empty).
        prefix: Leading part of the generated filename; ``"audio"`` if falsy.

    Returns:
        ``(dest_path, url)`` where ``url`` is ``/media/<generated filename>``.
    """
    cleanup_old_audio_files()

    label = prefix if prefix else "audio"
    extension = audio_path.suffix
    if not extension:
        extension = ".mp3"

    public_name = f"{label}_{uuid.uuid4().hex}{extension}"
    destination = settings.audio_dir / public_name
    shutil.copy2(audio_path, destination)

    return destination, f"/media/{public_name}"
|
src/server/services/podcast_service.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
from fastapi import HTTPException
|
| 7 |
+
|
| 8 |
+
from src.podcast import (
|
| 9 |
+
download_podcast_audio,
|
| 10 |
+
fetch_audio,
|
| 11 |
+
fetch_episodes,
|
| 12 |
+
search_podcast_series,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
from .file_service import store_audio_file
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def search_series(query: str) -> List[Dict[str, object]]:
    """Return whatever ``search_podcast_series`` yields for ``query``."""
    matches = search_podcast_series(query)
    return matches
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def list_episodes(feed_url: str) -> List[Dict[str, object]]:
    """Return whatever ``fetch_episodes`` yields for the given feed URL."""
    episodes = fetch_episodes(feed_url)
    return episodes
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def download_episode(audio_url: str, title: str) -> Dict[str, str]:
    """Download a podcast episode and publish it under the media folder.

    Returns:
        ``{"audioUrl": <public URL from store_audio_file>, "status": <status
        reported by download_podcast_audio>}``.

    Raises:
        HTTPException: Status 500 when the downloader returns a falsy file
            path; ``detail`` is the reported status or ``"Download failed"``.
    """
    local_file, message = download_podcast_audio(audio_url, title, status="Podcast download")

    if local_file:
        _stored_path, public_url = store_audio_file(Path(local_file), prefix="podcast")
        return {"audioUrl": public_url, "status": message}

    raise HTTPException(status_code=500, detail=message or "Download failed")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def fetch_youtube_audio(youtube_url: str) -> Dict[str, str]:
    """Fetch audio for a YouTube URL and publish it under the media folder.

    Returns:
        ``{"audioUrl": <public URL from store_audio_file>, "status": <status
        reported by fetch_audio>}``.

    Raises:
        HTTPException: Status 500 when ``fetch_audio`` returns a falsy path;
            ``detail`` is the reported status or ``"YouTube download failed"``.
    """
    local_file, message = fetch_audio(youtube_url, status="YouTube fetch")

    if local_file:
        _stored_path, public_url = store_audio_file(Path(local_file), prefix="youtube")
        return {"audioUrl": public_url, "status": message}

    raise HTTPException(status_code=500, detail=message or "YouTube download failed")
|
src/server/services/summarization_service.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, Iterable
|
| 4 |
+
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
|
| 7 |
+
from src.summarization import summarize_transcript
|
| 8 |
+
|
| 9 |
+
from ..models.summarization import SummaryRequest
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def iter_summary_events(payload: SummaryRequest) -> Iterable[Dict[str, str]]:
    """Yield progress events while ``summarize_transcript`` produces a summary.

    This is a generator: nothing runs, and nothing can fail, until the result
    is iterated.

    Yields:
        ``{"type": "partial", "content": chunk}`` for every chunk produced by
        ``summarize_transcript``, followed by a final ``{"type": "complete"}``.

    Raises:
        HTTPException: Status 500 with detail ``"Summary failed: <error>"``
            when summarization raises. An ``HTTPException`` raised from below
            is propagated unchanged rather than re-wrapped.
    """
    # NOTE(review): if the caller streams these events, this exception is
    # raised after the response has started -- confirm the router copes.
    try:
        generator = summarize_transcript(
            transcript=payload.transcript,
            selected_gguf_model=payload.llm_model,
            prompt_input=payload.prompt,
        )

        for chunk in generator:
            yield {"type": "partial", "content": chunk}

        yield {"type": "complete"}

    except HTTPException:
        # Already an HTTP error with its own status/detail; do not mask it.
        raise
    except Exception as exc:  # pragma: no cover
        # Chain explicitly so the original traceback is kept as the cause.
        raise HTTPException(status_code=500, detail=f"Summary failed: {exc}") from exc
|
src/summarization.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
# summarization.py
|
| 2 |
-
from llama_cpp import Llama
|
| 3 |
-
from utils import available_gguf_llms, s2tw_converter
|
| 4 |
import time
|
| 5 |
from functools import lru_cache
|
| 6 |
-
|
| 7 |
-
from
|
|
|
|
|
|
|
| 8 |
# Detect logical cores (vCPUs available to the container)
|
| 9 |
print(f"Detected vCPUs: {num_vcpus}")
|
| 10 |
|
|
|
|
| 1 |
# summarization.py
|
|
|
|
|
|
|
| 2 |
import time
|
| 3 |
from functools import lru_cache
|
| 4 |
+
|
| 5 |
+
from llama_cpp import Llama
|
| 6 |
+
|
| 7 |
+
from .utils import available_gguf_llms, num_vcpus, s2tw_converter
|
| 8 |
# Detect logical cores (vCPUs available to the container)
|
| 9 |
print(f"Detected vCPUs: {num_vcpus}")
|
| 10 |
|