-
Notifications
You must be signed in to change notification settings - Fork 18
/
convert.py
166 lines (131 loc) · 5.3 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""Convert Encodec checkpoint into the GGML format.
The bytes are packed in a binary file in the following order:
- Magic (`ggml` in binary format)
- Hyperparameters (9 ints)
- Tensors
For each tensor, the bytes are packed as follows:
- Number of dimensions (int)
- Name length (int)
- Data type (int; 0 = float32, 1 = float16)
- Dimensions (int[n_dims]), written in reverse order
- Name (char[name_length])
- Data (float32 or float16 values, one per tensor element)
Note
----
Encodec uses weight normalization for its convolutional layers. Each convolution weight
is stored in the checkpoint as two tensors whose names end in weight_v and weight_g. A
single call to torch._weight_norm recombines weight_v and weight_g into the final weight
tensor of the convolution. To reduce the number of operations at inference time, the
ggml weights file contains only the final convolution weights and does not store the
decomposition into weight_v and weight_g.
Usage
-----
```bash
python convert.py \
--dir-model ./ggml_weights/ \
--out-dir ./ggml_weights/ \
--use-f16
```
"""
import argparse
from pathlib import Path
import struct
import numpy as np
import torch
# Command-line interface of the converter. Flag names, destinations, types and
# defaults are unchanged; only the description and help strings are new.
parser = argparse.ArgumentParser(
    description="Convert an Encodec checkpoint into the GGML format.",
)
parser.add_argument(
    "--dir-model",
    type=str,
    required=True,
    help="directory containing the checkpoint file encodec_24khz-d7cc33bc.th",
)
parser.add_argument(
    "--out-dir",
    type=str,
    required=True,
    help="directory in which ggml-model.bin is written (created if missing)",
)
parser.add_argument(
    "--use-f16",
    action="store_true",
    help="store tensors whose name contains 'weight' (but not 'embed') as float16",
)
def _select_dtype(name, use_f16):
    """Return ``(numpy dtype, ftype code, label)`` used to store tensor *name*.

    The ftype code is 0 for float32 and 1 for float16. float16 is chosen only
    when *use_f16* is set and the name contains "weight" but not "embed";
    every other tensor is stored as float32.
    """
    if use_f16 and "embed" not in name and "weight" in name:
        return np.float16, 1, "float16"
    return np.float32, 0, "float32"


def parse_codec_model(checkpoint, outfile, use_f16):
    """Serialize the tensors of an Encodec checkpoint into *outfile*.

    Parameters
    ----------
    checkpoint : mapping of str to torch.Tensor
        Tensors keyed by name. Entries whose name contains "weight_g",
        "inited", "cluster_size" or "embed_avg" are not written on their own.
    outfile : binary file object
        Destination opened for writing. It is closed before this function
        returns, including when an error interrupts the conversion.
    use_f16 : bool
        If true, tensors whose name contains "weight" (but not "embed") are
        stored as float16; everything else is stored as float32.

    Raises
    ------
    KeyError
        If a "weight_v" tensor has no matching "weight_g" entry.

    Notes
    -----
    Each tensor is written as: three ints (number of dimensions, encoded name
    length, ftype code), the dimensions in reverse order (one int each), the
    UTF-8 encoded name, then the raw tensor data.
    """
    n_f16, n_f32 = 0, 0

    try:
        for name, var_data in checkpoint.items():
            if "weight_g" in name:
                # Consumed together with its "weight_v" partner below, so it is
                # never written as a tensor of its own.
                continue

            if "inited" in name or "cluster_size" in name or "embed_avg" in name:
                # Quantizer bookkeeping tensors that the forward pass does not use.
                continue

            if "weight_v" in name:
                # weight_v comes with a magnitude tensor weight_g; combine both
                # into the final convolution weight and store it under
                # "<prefix>.weight". The result is not squeezed, so the kernel
                # keeps all of its dimensions.
                base_name = name.split(".")[:-1]
                var_data_g = checkpoint[".".join(base_name + ["weight_g"])]
                var_data = torch._weight_norm(var_data, var_data_g, dim=0).numpy()
                name = ".".join(base_name + ["weight"])
            else:
                # Any other tensor has its size-1 dimensions removed.
                var_data = var_data.numpy().squeeze()

            print(f"Processing variable: {name} with shape: {var_data.shape}")

            dtype, ftype_cur, label = _select_dtype(name, use_f16)
            print(f" Converting to {label}")
            var_data = var_data.astype(dtype)
            if ftype_cur == 1:
                n_f16 += 1
            else:
                n_f32 += 1

            encoded_name = name.encode("utf-8")
            outfile.write(
                struct.pack("iii", var_data.ndim, len(encoded_name), ftype_cur)
            )
            # Dimensions are written last-to-first.
            for dim in reversed(var_data.shape):
                outfile.write(struct.pack("i", dim))
            outfile.write(encoded_name)

            var_data.tofile(outfile)
    finally:
        # Release the handle even if a tensor failed to convert.
        outfile.close()

    # Guard the percentages: a checkpoint with no convertible tensor would
    # otherwise raise ZeroDivisionError here.
    total = n_f16 + n_f32
    pct_f16 = n_f16 / total * 100 if total else 0.0
    pct_f32 = n_f32 / total * 100 if total else 0.0

    print("\n")
    print(f"n_f16: {n_f16} ({pct_f16:.0f}%)")
    print(f"n_f32: {n_f32} ({pct_f32:.0f}%)")
def parse_hparams(outfile, use_f16):
    """Write the model hyperparameters to *outfile* as nine consecutive ints.

    The values are fixed for now because only the 24Khz model is supported.
    The last field is ``int(use_f16)``.
    """
    hparams = (
        1,             # in_channels
        128,           # hidden_dim
        32,            # n_filters
        7,             # kernel_size
        3,             # residual_kernel_size
        1024,          # n_bins
        24,            # bandwidth
        24000,         # sr
        int(use_f16),  # ftype
    )
    # One write per field, in the order listed above.
    for value in hparams:
        outfile.write(struct.pack("i", value))
def main():
    """Convert the Encodec checkpoint found in --dir-model into ggml-model.bin.

    Reads ``encodec_24khz-d7cc33bc.th`` from the model directory and writes
    ``ggml-model.bin`` into the output directory, creating it if needed. The
    file holds the ggml magic, then the hyperparameters, then the weights.
    """
    args = parser.parse_args()

    dir_model = Path(args.dir_model)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)
    out_path = out_dir / "ggml-model.bin"

    # NOTE(review): depending on the installed torch version's `weights_only`
    # default, torch.load may unpickle arbitrary objects. Only run this on
    # checkpoints from a trusted source.
    checkpoint = torch.load(
        dir_model / "encodec_24khz-d7cc33bc.th", map_location="cpu"
    )

    # The context manager guarantees the handle is released even if one of the
    # steps below raises. parse_codec_model also closes the file itself;
    # closing an already-closed file is a no-op.
    with open(out_path, "wb") as fout:
        # Step 1: insert ggml magic
        fout.write(struct.pack("i", 0x67676d6c))

        # Step 2: insert hyperparameters
        parse_hparams(fout, args.use_f16)

        # Step 3: insert weights
        parse_codec_model(checkpoint, fout, args.use_f16)

    print("Done.")


if __name__ == "__main__":
    main()