Commit
·
3c5a93a
1
Parent(s):
f974bb6
up
Browse files- convert.py +194 -0
- whisper-32-2.pt +3 -0
convert.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""Converts a Whisper model in Hugging Face format to OpenAI format.
|
| 3 |
+
This script is based on the following script to do the opposite:
|
| 4 |
+
https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/convert_openai_to_hf.py
|
| 5 |
+
Requirements:
|
| 6 |
+
```bash
|
| 7 |
+
pip install -U openai-whisper
|
| 8 |
+
```
|
| 9 |
+
Example:
|
| 10 |
+
```bash
|
| 11 |
+
# Converts the model from Hugging Face to OpenAI format:
|
| 12 |
+
python convert_hf_to_openai.py \
|
| 13 |
+
--checkpoint openai/whisper-tiny \
|
| 14 |
+
--whisper_dump_path whisper-tiny-openai.pt
|
| 15 |
+
```
|
| 16 |
+
```python
|
| 17 |
+
>>> # Disabled doctest because it requries the openai-whisper package.
|
| 18 |
+
>> import whisper
|
| 19 |
+
>> from transformers.models.whisper.convert_hf_to_openai import convert_tfms_to_openai_whisper
|
| 20 |
+
>> # Converts the model from Hugging Face to OpenAI format:
|
| 21 |
+
>> convert_tfms_to_openai_whisper(
|
| 22 |
+
.. "openai/whisper-tiny", "whisper-tiny-openai.pt"
|
| 23 |
+
.. )
|
| 24 |
+
HF model path: openai/whisper-tiny
|
| 25 |
+
OpenAI model path: whisper-tiny-openai.pt
|
| 26 |
+
>> # Select an audio file:
|
| 27 |
+
>> audio_path = "https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav"
|
| 28 |
+
>> # Load the Whisper model in OpenAI format:
|
| 29 |
+
>> model = whisper.load_model("whisper-tiny-openai.pt")
|
| 30 |
+
>> # Transcribe the audio:
|
| 31 |
+
>> prediction = model.transcribe(audio_path)
|
| 32 |
+
>> prediction["text"][:70]
|
| 33 |
+
' chapter 16. I might have told you of the beginning of this liaison in'
|
| 34 |
+
```
|
| 35 |
+
"""
|
| 36 |
+
# Copyright 2023 Xabier de Zuazo and the Aholab team. All rights reserved.
|
| 37 |
+
#
|
| 38 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 39 |
+
# you may not use this file except in compliance with the License.
|
| 40 |
+
# You may obtain a copy of the License at
|
| 41 |
+
#
|
| 42 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 43 |
+
#
|
| 44 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 45 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 46 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 47 |
+
# See the License for the specific language governing permissions and
|
| 48 |
+
# limitations under the License.
|
| 49 |
+
|
| 50 |
+
import argparse
|
| 51 |
+
|
| 52 |
+
import torch
|
| 53 |
+
from torch import nn
|
| 54 |
+
|
| 55 |
+
from transformers import WhisperConfig, WhisperForConditionalGeneration
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Create the reverse mapping adapting it from the original `WHISPER_MAPPING` in
|
| 59 |
+
# the `convert_openai_to_hf.py` script:
|
| 60 |
+
REVERSE_WHISPER_MAPPING = {
|
| 61 |
+
"layers": "blocks",
|
| 62 |
+
"fc1": "mlp.0",
|
| 63 |
+
"fc2": "mlp.2",
|
| 64 |
+
"final_layer_norm": "mlp_ln",
|
| 65 |
+
".self_attn.q_proj": ".attn.query",
|
| 66 |
+
".self_attn.k_proj": ".attn.key",
|
| 67 |
+
".self_attn.v_proj": ".attn.value",
|
| 68 |
+
".self_attn_layer_norm": ".attn_ln",
|
| 69 |
+
".self_attn.out_proj": ".attn.out",
|
| 70 |
+
".encoder_attn.q_proj": ".cross_attn.query",
|
| 71 |
+
".encoder_attn.k_proj": ".cross_attn.key",
|
| 72 |
+
".encoder_attn.v_proj": ".cross_attn.value",
|
| 73 |
+
".encoder_attn_layer_norm": ".cross_attn_ln",
|
| 74 |
+
".encoder_attn.out_proj": ".cross_attn.out",
|
| 75 |
+
"decoder.layer_norm.": "decoder.ln.",
|
| 76 |
+
"encoder.layer_norm.": "encoder.ln_post.",
|
| 77 |
+
"embed_tokens": "token_embedding",
|
| 78 |
+
"encoder.embed_positions.weight": "encoder.positional_embedding",
|
| 79 |
+
"decoder.embed_positions.weight": "decoder.positional_embedding",
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def reverse_rename_keys(s_dict: dict) -> dict:
|
| 84 |
+
"""Renames the keys back from Hugging Face to OpenAI Whisper format.
|
| 85 |
+
By using this function on an HF model's state_dict, we should get the names in the format expected by Whisper.
|
| 86 |
+
Args:
|
| 87 |
+
s_dict (`dict`): A dictionary with keys in Hugging Face format.
|
| 88 |
+
Returns:
|
| 89 |
+
`dict`: The same dictionary but in OpenAI Whisper format.
|
| 90 |
+
"""
|
| 91 |
+
keys = list(s_dict.keys())
|
| 92 |
+
for orig_key in keys:
|
| 93 |
+
new_key = orig_key
|
| 94 |
+
for key_r, value_r in REVERSE_WHISPER_MAPPING.items():
|
| 95 |
+
if key_r in orig_key:
|
| 96 |
+
new_key = new_key.replace(key_r, value_r)
|
| 97 |
+
|
| 98 |
+
# print(f"{orig_key} -> {new_key}")
|
| 99 |
+
|
| 100 |
+
s_dict[new_key] = s_dict.pop(orig_key)
|
| 101 |
+
return s_dict
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def make_emb_from_linear(linear: nn.Linear) -> nn.Embedding:
|
| 105 |
+
"""Converts a linear layer's weights into an embedding layer.
|
| 106 |
+
The linear layer's `in_features` dimension corresponds to the vocabulary size and its `out_features` dimension
|
| 107 |
+
corresponds to the embedding size.
|
| 108 |
+
Args:
|
| 109 |
+
linear (`nn.Linear`): The linear layer to be converted.
|
| 110 |
+
Returns:
|
| 111 |
+
`nn.Embedding`:
|
| 112 |
+
An embedding layer with weights set to those of the input linear layer.
|
| 113 |
+
"""
|
| 114 |
+
vocab_size, emb_size = linear.weight.data.shape
|
| 115 |
+
emb_layer = nn.Embedding(vocab_size, emb_size, _weight=linear.weight.data)
|
| 116 |
+
return emb_layer
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def extract_dims_from_hf(config: WhisperConfig) -> dict:
|
| 120 |
+
"""Extracts necessary dimensions from Hugging Face's WhisperConfig.
|
| 121 |
+
Extracts necessary dimensions and related configuration data from the Hugging Face model and then restructure it
|
| 122 |
+
for the OpenAI Whisper format.
|
| 123 |
+
Args:
|
| 124 |
+
config (`WhisperConfig`): Configuration of the Hugging Face's model.
|
| 125 |
+
Returns:
|
| 126 |
+
`dict`: The `dims` of the OpenAI Whisper model.
|
| 127 |
+
"""
|
| 128 |
+
dims = {
|
| 129 |
+
"n_vocab": config.vocab_size,
|
| 130 |
+
"n_mels": config.num_mel_bins,
|
| 131 |
+
"n_audio_state": config.d_model,
|
| 132 |
+
"n_text_ctx": config.max_target_positions,
|
| 133 |
+
"n_audio_layer": config.encoder_layers,
|
| 134 |
+
"n_audio_head": config.encoder_attention_heads,
|
| 135 |
+
"n_text_layer": config.decoder_layers,
|
| 136 |
+
"n_text_head": config.decoder_attention_heads,
|
| 137 |
+
"n_text_state": config.d_model,
|
| 138 |
+
"n_audio_ctx": config.max_source_positions,
|
| 139 |
+
}
|
| 140 |
+
return dims
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def convert_tfms_to_openai_whisper(hf_model_path: str, whisper_dump_path: str):
|
| 144 |
+
"""Converts a Whisper model from the Hugging Face to the OpenAI format.
|
| 145 |
+
Takes in the path to a Hugging Face Whisper model, extracts its state_dict, renames keys as needed, and then saves
|
| 146 |
+
the model OpenAI's format.
|
| 147 |
+
Args:
|
| 148 |
+
hf_model_path (`str`):
|
| 149 |
+
Path to the pretrained Whisper model in Hugging Face format.
|
| 150 |
+
whisper_dump_path (`str`):
|
| 151 |
+
Destination path where the converted model in Whisper/OpenAI format will be saved.
|
| 152 |
+
Returns:
|
| 153 |
+
`None`
|
| 154 |
+
"""
|
| 155 |
+
print("HF model path:", hf_model_path)
|
| 156 |
+
print("OpenAI model path:", whisper_dump_path)
|
| 157 |
+
|
| 158 |
+
# Load the HF model and its state_dict
|
| 159 |
+
model = WhisperForConditionalGeneration.from_pretrained(hf_model_path)
|
| 160 |
+
state_dict = model.state_dict()
|
| 161 |
+
|
| 162 |
+
# Use a reverse mapping to rename state_dict keys
|
| 163 |
+
state_dict = reverse_rename_keys(state_dict)
|
| 164 |
+
|
| 165 |
+
# Extract configurations and other necessary metadata
|
| 166 |
+
dims = extract_dims_from_hf(model.config)
|
| 167 |
+
|
| 168 |
+
# Remove the proj_out weights from state dictionary
|
| 169 |
+
del state_dict["proj_out.weight"]
|
| 170 |
+
|
| 171 |
+
# Construct the Whisper checkpoint structure
|
| 172 |
+
state_dict = {k.replace("model.", "", 1): v for k, v in state_dict.items()}
|
| 173 |
+
whisper_checkpoint = {"dims": dims, "model_state_dict": state_dict}
|
| 174 |
+
|
| 175 |
+
# Save in Whisper's format
|
| 176 |
+
torch.save(whisper_checkpoint, whisper_dump_path)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
|
| 180 |
+
parser = argparse.ArgumentParser()
|
| 181 |
+
# Required parameters
|
| 182 |
+
parser.add_argument(
|
| 183 |
+
"--checkpoint",
|
| 184 |
+
type=str,
|
| 185 |
+
help="Path of name of the Hugging Face checkpoint.", # noqa: E501
|
| 186 |
+
)
|
| 187 |
+
parser.add_argument(
|
| 188 |
+
"--whisper_dump_path",
|
| 189 |
+
type=str,
|
| 190 |
+
help="Path to the output Whisper model.", # noqa: E501
|
| 191 |
+
)
|
| 192 |
+
args = parser.parse_args()
|
| 193 |
+
|
| 194 |
+
convert_tfms_to_openai_whisper(args.checkpoint, args.whisper_dump_path)
|
whisper-32-2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2eadb7893a248cab031b175a7cdf09a3e19f525fa3fb9ae1f33a9e036bd985c
|
| 3 |
+
size 3025049815
|