Hervé BREDIN committed
feat: update to latest pyannote and wavesurfer (#3)

Changed files:
- app.py (+47 -46)
- assets/template.html (+39 -44)
- requirements.txt (+1 -7)
app.py CHANGED

@@ -23,10 +23,10 @@

 import io
 import base64
+import torch
 import numpy as np
 import scipy.io.wavfile
 from typing import Text
-from huggingface_hub import HfApi
 import streamlit as st
 from pyannote.audio import Pipeline
 from pyannote.audio import Audio
@@ -49,32 +49,47 @@ def to_base64(waveform: np.ndarray, sample_rate: int = 16000) -> Text:
 PYANNOTE_LOGO = "https://avatars.githubusercontent.com/u/7559051?s=400&v=4"
 EXCERPT = 30.0

-st.set_page_config(
-    page_title="pyannote.audio pretrained pipelines", page_icon=PYANNOTE_LOGO
-)
-
-st.markdown("""# 🎹 Pretrained pipelines
-""")
+st.set_page_config(page_title="pyannote pretrained pipelines", page_icon=PYANNOTE_LOGO)
+
+col1, col2 = st.columns([0.2, 0.8], gap="small")
+
+with col1:
+    st.image(PYANNOTE_LOGO)
+
+with col2:
+    st.markdown(
+        """
+        # pretrained pipelines
+        Make the most of [pyannote](https://github.com/pyannote) thanks to our [consulting services](https://herve.niderb.fr/consulting.html)
+        """
+    )

 PIPELINES = [
-    for p in HfApi().list_models(filter="pyannote-audio-pipeline")
-    if p.modelId.startswith("pyannote/")
+    "pyannote/speaker-diarization-3.0",
 ]

 audio = Audio(sample_rate=16000, mono=True)

-selected_pipeline = st.selectbox("Select a pipeline", PIPELINES, index=0)
+selected_pipeline = st.selectbox("Select a pretrained pipeline", PIPELINES, index=0)
+

 with st.spinner("Loading pipeline..."):
+    try:
+        use_auth_token = st.secrets["PYANNOTE_TOKEN"]
+    except FileNotFoundError:
+        use_auth_token = None
+    except KeyError:
+        use_auth_token = None
+
+    pipeline = Pipeline.from_pretrained(
+        selected_pipeline, use_auth_token=use_auth_token
+    )
+    if torch.cuda.is_available():
+        pipeline.to(torch.device("cuda"))

-uploaded_file = st.file_uploader("
+uploaded_file = st.file_uploader("Upload an audio file")
 if uploaded_file is not None:
     try:
         duration = audio.get_duration(uploaded_file)
     except RuntimeError as e:
@@ -86,12 +101,12 @@ if uploaded_file is not None:
     uri = "".join(uploaded_file.name.split())
     file = {"waveform": waveform, "sample_rate": sample_rate, "uri": uri}

-    with st.spinner(f"Processing
+    with st.spinner(f"Processing {EXCERPT:g} seconds..."):
         output = pipeline(file)

-    with open(
+    with open("assets/template.html") as html, open("assets/style.css") as css:
         html_template = html.read()
-    st.markdown(
+        st.markdown("<style>{}</style>".format(css.read()), unsafe_allow_html=True)

     colors = [
         "#ffd70033",
@@ -105,50 +120,36 @@ if uploaded_file is not None:
     ]
     num_colors = len(colors)

-    label2color = {
+    label2color = {
+        label: colors[k % num_colors] for k, label in enumerate(sorted(output.labels()))
+    }

     BASE64 = to_base64(waveform.numpy().T)

     REGIONS = ""
-    LEGENDS = ""
-    labels=[]
     for segment, _, label in output.itertracks(yield_label=True):
-        REGIONS += f"
-        if not label in labels:
-            LEGENDS += f"<li><span style='background-color:{label2color[label]}'></span>{label}</li>"
-            labels.append(label)
+        REGIONS += f"regions.addRegion({{start: {segment.start:g}, end: {segment.end:g}, color: '{label2color[label]}', resize : false, drag : false}});"

     html = html_template.replace("BASE64", BASE64).replace("REGIONS", REGIONS)
     components.html(html, height=250, scrolling=True)
-    st.markdown("<div style='overflow : auto'><ul class='legend'>"+LEGENDS+"</ul></div>", unsafe_allow_html=True)
-
-    st.markdown("---")

     with io.StringIO() as fp:
         output.write_rttm(fp)
         content = fp.getvalue()
     b64 = base64.b64encode(content.encode()).decode()
-    href = f'
+    href = f'<a download="{output.uri}.rttm" href="data:file/text;base64,{b64}">Download</a> result in RTTM file format or run it locally:'
     st.markdown(href, unsafe_allow_html=True)

     code = f"""
-    st.code(code, language='python')
-
-    st.sidebar.markdown(
-        """
-        -------------------
-
-        To use these pipelines on more and longer files on your own (GPU, hence much faster) servers, check the [documentation](https://github.com/pyannote/pyannote-audio).
-
-        """
-    )
+# load pretrained pipeline
+from pyannote.audio import Pipeline
+pipeline = Pipeline.from_pretrained("{selected_pipeline}",
+                                    use_auth_token=HUGGINGFACE_TOKEN)
+
+# (optional) send pipeline to GPU
+import torch
+pipeline.to(torch.device("cuda"))
+
+# process audio file
+output = pipeline("audio.wav")"""
+    st.code(code, language="python")
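The hunk header above references a to_base64(waveform, sample_rate) helper whose body is not part of this diff. As a hedged sketch of what such a helper could look like, given the io, base64, numpy and scipy.io.wavfile imports already present in app.py, it might encode the 16 kHz mono excerpt as a WAV data URI that the template's wavesurfer.load('BASE64') call can play. The exact implementation and the audio/x-wav MIME prefix below are assumptions, not taken from the commit:

# Hypothetical sketch, not part of the commit: encode a waveform as a base64
# WAV data URI so the browser-side wavesurfer.load('BASE64') call can play it.
import base64
import io

import numpy as np
import scipy.io.wavfile


def to_base64(waveform: np.ndarray, sample_rate: int = 16000) -> str:
    """Return a data URI holding `waveform` as a WAV file."""
    with io.BytesIO() as wav:
        # scipy writes a complete WAV header followed by the samples
        scipy.io.wavfile.write(wav, sample_rate, waveform)
        payload = base64.b64encode(wav.getvalue()).decode()
    # "audio/x-wav" is an assumption; any WAV MIME type the browser accepts works
    return f"data:audio/x-wav;base64,{payload}"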
assets/template.html CHANGED

@@ -1,46 +1,41 @@
-<script
-    })
-  ]
-});
-wavesurfer.load('BASE64');
-wavesurfer.on('ready', function () {
-    wavesurfer.play();
-});
-wavesurfer.on('play',function() {
-    document.getElementById('ppb').innerHTML = "Pause";
-});
-wavesurfer.on('pause',function() {
-    document.getElementById('ppb').innerHTML = "Play";
-});
+<script type="module">
+import WaveSurfer from 'https://unpkg.com/wavesurfer.js@7/dist/wavesurfer.esm.js'
+import RegionsPlugin from 'https://unpkg.com/wavesurfer.js@7/dist/plugins/regions.esm.js'
+
+var labels=[];
+const wavesurfer = WaveSurfer.create({
+    container: '#waveform',
+    barGap: 2,
+    barHeight: 3,
+    barWidth: 3,
+    barRadius: 2,
+});
+
+const regions = wavesurfer.registerPlugin(RegionsPlugin.create())
+
+wavesurfer.load('BASE64');
+wavesurfer.on('ready', function () {
+    wavesurfer.play();
+});
+
+wavesurfer.on('decode', function () {
 REGIONS
+    wavesurfer.play();
+});
+
+wavesurfer.on('click', () => {
+    play();
+});
+
+function play(){
+    wavesurfer.isPlaying() ? wavesurfer.pause() : wavesurfer.play();
+}
+
 </script>
+<div id="waveform"></div>
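For context on how this template is consumed: app.py replaces the literal BASE64 and REGIONS tokens before handing the HTML to Streamlit, so the REGIONS line above ends up as a series of regions.addRegion(...) calls inside the 'decode' handler. A minimal sketch of that substitution, with made-up segment times and a second color standing in for the diarization output shown in app.py above:

# Illustration only: the segments and colors below are invented; in the app they
# come from pipeline(file) and the label2color mapping built in app.py.
from pathlib import Path

segments = [(0.5, 2.3, "#ffd70033"), (2.9, 4.1, "#00ffff33")]

REGIONS = ""
for start, end, color in segments:
    REGIONS += (
        f"regions.addRegion({{start: {start:g}, end: {end:g}, "
        f"color: '{color}', resize : false, drag : false}});"
    )

template = Path("assets/template.html").read_text()
html = template.replace("BASE64", "data:audio/x-wav;base64,...").replace("REGIONS", REGIONS)
# in the app, this string is rendered with streamlit.components.v1.html(html, height=250)
print(html)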
requirements.txt CHANGED

@@ -1,7 +1 @@
-torchvision==0.12.0
-torchaudio==0.11.0
-torchtext==0.12.0
-speechbrain==0.5.12
-pyannote-audio>=2.1
+pyannote-audio==3.0.1