--- language: - en - zh - de - es - ru - ko - fr - ja - pt - tr - pl - ca - nl - ar - sv - it - id - hi - fi - vi - he - uk - el - ms - cs - ro - da - hu - ta - 'no' - th - ur - hr - bg - lt - la - mi - ml - cy - sk - te - fa - lv - bn - sr - az - sl - kn - et - mk - br - eu - is - hy - ne - mn - bs - kk - sq - sw - gl - mr - pa - si - km - sn - yo - so - af - oc - ka - be - tg - sd - gu - am - yi - lo - uz - fo - ht - ps - tk - nn - mt - sa - lb - my - bo - tl - mg - as - tt - haw - ln - ha - ba - jw - su tags: - audio - automatic-speech-recognition - hf-asr-leaderboard pipeline_tag: automatic-speech-recognition license: apache-2.0 license_link: https://choosealicense.com/licenses/apache-2.0/ base_model: - openai/whisper-large-v3-turbo --- # whisper-large-v3-fp16-ov * Model creator: [OpenAI](https://huggingface.co/openai) * Original model: [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) ## Description This is [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) model converted to the [OpenVINO™ IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) (Intermediate Representation) format with weights compressed to FP16. ## Compatibility The provided OpenVINO™ IR model is compatible with: * OpenVINO version 2025.2.0 and higher * Optimum Intel 1.23.0 and higher ```bash optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int4 --disable-stateful whisper-large-v3-turbo-int4-ov ``` ```python #!/usr/bin/env python3 import time import requests import openvino_genai import librosa from pathlib import Path from huggingface_hub import snapshot_download def download_model(model_id="FluidInference/whisper-large-v3-turbo-int4-ov-npu"): """Download model from HuggingFace Hub""" local_dir = Path("models") / model_id.split("/")[-1] if local_dir.exists() and any(local_dir.iterdir()): return str(local_dir) print(f"Downloading model...") snapshot_download( repo_id=model_id, local_dir=str(local_dir), local_dir_use_symlinks=False ) return str(local_dir) def download_hf_audio_samples(): """Download audio samples from Hugging Face""" samples_dir = Path("sample_audios") samples_dir.mkdir(exist_ok=True) downloaded = [] whisper_samples = [ ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"), ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"), ] for url, filename in whisper_samples: filepath = samples_dir / filename if filepath.exists(): downloaded.append(str(filepath)) continue try: response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}) response.raise_for_status() with open(filepath, 'wb') as f: f.write(response.content) downloaded.append(str(filepath)) except Exception as e: print(f"Error downloading {filename}: {e}") return downloaded def read_audio(filepath): """Read audio file and convert to 16kHz""" try: raw_speech, _ = librosa.load(filepath, sr=16000) return raw_speech.tolist() except Exception as e: print(f"Error reading {filepath}: {e}") return None def test_whisper_on_file(pipe, filepath): """Test Whisper on a single audio file""" config = pipe.get_generation_config() config.language = "<|en|>" config.task = "transcribe" config.return_timestamps = True config.max_new_tokens = 448 raw_speech = read_audio(filepath) if raw_speech is None: return None duration = len(raw_speech) / 16000 start_time = time.time() result = pipe.generate(raw_speech, config) inference_time = time.time() - start_time return { "file": filepath, "duration": duration, "inference_time": inference_time, "rtf": inference_time/duration, "transcription": str(result) } def main(): # Download model model_path = download_model() # Initialize pipeline on NPU print(f"\nInitializing NPU...") start_time = time.time() pipe = openvino_genai.WhisperPipeline(model_path, "NPU") init_time = time.time() - start_time results = [] # Collect test files test_files = [] test_files.extend(Path(".").glob("*.wav")) if Path("samples/c/whisper_speech_recognition").exists(): test_files.extend(Path("samples/c/whisper_speech_recognition").glob("*.wav")) # Download HF samples hf_samples = download_hf_audio_samples() test_files.extend([Path(f) for f in hf_samples]) # Test all files print(f"\nTesting {len(test_files)} files...") for audio_file in test_files: result = test_whisper_on_file(pipe, str(audio_file)) if result: results.append(result) print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x") # Print summary if results: total_duration = sum(r["duration"] for r in results) total_inference = sum(r["inference_time"] for r in results) avg_rtf = total_inference / total_duration print(f"\n{'='*50}") print(f"NPU Performance Summary") print(f"{'='*50}") print(f"Model load time: {init_time:.1f}s") print(f"Files tested: {len(results)}") print(f"Total audio: {total_duration:.1f}s") print(f"Total inference: {total_inference:.1f}s") print(f"Average RTF: {avg_rtf:.2f}x {'[Faster than real-time]' if avg_rtf < 1 else '[Slower than real-time]'}") print(f"\nResults:") for r in results: trans = r['transcription'].strip() if len(trans) > 60: trans = trans[:57] + "..." print(f"- {Path(r['file']).name}: \"{trans}\"") if __name__ == "__main__": main() ```