183 lines
8.1 KiB
Python
Executable File
183 lines
8.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import struct
|
|
import sys
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
def write_tensor(fout, name, data):
    """Serialize one tensor record to ``fout`` as float32.

    Record layout, in write order:

    1. three ints: number of dimensions, byte length of the UTF-8 encoded
       name, and the type tag (always 0, i.e. GGML_TYPE_F32);
    2. one int per dimension, in the reverse order of ``data.shape``;
    3. the UTF-8 encoded name (no terminator);
    4. the float32 payload in C (row-major) order.

    Integers are packed with ``struct``'s native ``"i"`` format.

    Args:
        fout: Writable binary stream. Only ``write()`` is used, so in-memory
            streams such as ``io.BytesIO`` work as well as real files.
        name: Tensor name (``str``).
        data: NumPy array; it is converted to float32 before being written.
    """
    values = data.astype(np.float32)
    ftype = 0  # GGML_TYPE_F32
    name_bytes = name.encode('utf-8')

    fout.write(struct.pack("iii", values.ndim, len(name_bytes), ftype))
    # Dimensions are stored reversed relative to the NumPy shape, so the
    # fastest-varying (last) axis comes first.
    for dim in reversed(values.shape):
        fout.write(struct.pack("i", dim))
    fout.write(name_bytes)
    # tobytes() yields the same C-order bytes as ndarray.tofile(), but does
    # not require the stream to be backed by a file descriptor.
    fout.write(values.tobytes())
|
|
|
|
def generate(output_path):
    """Write a small model file filled with seeded random weights.

    The file is produced with a fixed RNG seed (42), so repeated runs yield
    the same bytes. Layout, in write order:

    1. magic number ``0x67676d6c``;
    2. 15 hyperparameters as ints, in the order of ``header_keys`` below;
    3. ``n_mels``, ``n_freqs`` and an ``(n_mels, n_freqs)`` float32 table
       (presumably the mel filterbank -- confirm against the loader);
    4. ``n_fft`` and ``n_fft`` float32 values
       (presumably the analysis window -- confirm against the loader);
    5. the values ``0 .. n_tdt_durations - 1`` as unsigned ints;
    6. the vocabulary: token count, then length-prefixed UTF-8 tokens;
    7. tensor records written through ``write_tensor``.

    Args:
        output_path: Path of the file to create. Missing parent directories
            are created, so the default ``models/...`` location works from a
            fresh checkout.
    """
    rng = np.random.default_rng(42)

    hparams = {
        'n_vocab': 10,
        'n_audio_ctx': 3200,
        'n_audio_state': 8,
        'n_audio_head': 2,
        'n_audio_layer': 1,
        'n_mels': 16,
        'ftype': 0,
        'n_fft': 64,
        'subsampling_factor': 8,
        'n_subsampling_channels': 4,
        'n_conv_kernel': 3,
        'n_pred_dim': 8,
        'n_pred_layers': 1,
        'n_tdt_durations': 2,
        'n_max_tokens': 5,
    }

    # On-disk order of the header fields; kept explicit because it is part of
    # the file format rather than an accident of dict ordering.
    header_keys = [
        'n_vocab',
        'n_audio_ctx',
        'n_audio_state',
        'n_audio_head',
        'n_audio_layer',
        'n_mels',
        'ftype',
        'n_fft',
        'subsampling_factor',
        'n_subsampling_channels',
        'n_conv_kernel',
        'n_pred_dim',
        'n_pred_layers',
        'n_tdt_durations',
        'n_max_tokens',
    ]

    n_vocab = hparams['n_vocab']
    n_state = hparams['n_audio_state']
    n_head = hparams['n_audio_head']
    n_layer = hparams['n_audio_layer']
    n_mels = hparams['n_mels']
    n_fft = hparams['n_fft']
    n_sub_fac = hparams['subsampling_factor']
    n_sub_ch = hparams['n_subsampling_channels']
    n_conv_ker = hparams['n_conv_kernel']
    dec_dim = hparams['n_pred_dim']
    n_pred_l = hparams['n_pred_layers']
    n_tdt = hparams['n_tdt_durations']

    # Sizes derived from the hyperparameters.
    n_pre_enc = (n_mels // n_sub_fac) * n_sub_ch
    n_head_dim = n_state // n_head
    n_pred_embed = n_vocab + 1
    n_lstm_gates = 4 * dec_dim
    n_joint_out = n_vocab + n_tdt + 1
    n_freqs = n_fft // 2 + 1

    def f32(*shape):
        """Draw a float32 array of the given shape from the seeded generator."""
        return rng.standard_normal(shape).astype(np.float32)

    def reorder_gates(data):
        """Swap the last two of the four equal blocks along axis 0.

        NOTE(review): presumably converts the LSTM gate order expected by the
        consumer of this file -- confirm against the loader.
        """
        h = data.shape[0] // 4
        return np.concatenate([data[:h], data[h:2*h], data[3*h:], data[2*h:3*h]], axis=0)

    # open() does not create directories; make sure the target one exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # NOTE: every f32() call consumes random numbers, so the order of the
    # writes below determines the file contents. Do not reorder them.
    with open(output_path, 'wb') as fout:
        fout.write(struct.pack("I", 0x67676d6c))

        for key in header_keys:
            fout.write(struct.pack("i", hparams[key]))

        fout.write(struct.pack("i", n_mels))
        fout.write(struct.pack("i", n_freqs))
        f32(n_mels, n_freqs).tofile(fout)

        fout.write(struct.pack("i", n_fft))
        f32(n_fft).tofile(fout)

        for d in range(n_tdt):
            fout.write(struct.pack("I", d))

        # Vocabulary: three special tokens followed by single lowercase letters.
        tokens = ['<unk>', '<s>', '</s>'] + [chr(ord('a') + i) for i in range(n_vocab - 3)]
        assert len(tokens) == n_vocab
        fout.write(struct.pack("i", n_vocab))
        for tok in tokens:
            tok_bytes = tok.encode('utf-8')
            fout.write(struct.pack("i", len(tok_bytes)))
            fout.write(tok_bytes)

        write_tensor(fout, "encoder.pre_encode.out.weight", f32(n_state, n_pre_enc))
        write_tensor(fout, "encoder.pre_encode.out.bias", f32(n_state))

        write_tensor(fout, "encoder.pre_encode.conv.0.weight", f32(n_sub_ch, 1, 3, 3))
        write_tensor(fout, "encoder.pre_encode.conv.0.bias", f32(1, n_sub_ch, 1, 1))

        write_tensor(fout, "encoder.pre_encode.conv.2.weight", f32(n_sub_ch, 1, 3, 3))
        write_tensor(fout, "encoder.pre_encode.conv.2.bias", f32(1, n_sub_ch, 1, 1))

        write_tensor(fout, "encoder.pre_encode.conv.3.weight", f32(n_sub_ch, n_sub_ch, 1, 1))
        write_tensor(fout, "encoder.pre_encode.conv.3.bias", f32(1, n_sub_ch, 1, 1))

        write_tensor(fout, "encoder.pre_encode.conv.5.weight", f32(n_sub_ch, 1, 3, 3))
        write_tensor(fout, "encoder.pre_encode.conv.5.bias", f32(1, n_sub_ch, 1, 1))

        write_tensor(fout, "encoder.pre_encode.conv.6.weight", f32(n_sub_ch, n_sub_ch, 1, 1))
        write_tensor(fout, "encoder.pre_encode.conv.6.bias", f32(1, n_sub_ch, 1, 1))

        for i in range(n_layer):
            p = f"encoder.layers.{i}"

            write_tensor(fout, f"{p}.norm_feed_forward1.weight", f32(n_state))
            write_tensor(fout, f"{p}.norm_feed_forward1.bias", f32(n_state))
            write_tensor(fout, f"{p}.feed_forward1.linear1.weight", f32(4*n_state, n_state))
            write_tensor(fout, f"{p}.feed_forward1.linear2.weight", f32(n_state, 4*n_state))

            write_tensor(fout, f"{p}.norm_conv.weight", f32(n_state))
            write_tensor(fout, f"{p}.norm_conv.bias", f32(n_state))
            write_tensor(fout, f"{p}.conv.pointwise_conv1.weight", f32(2*n_state, n_state))
            write_tensor(fout, f"{p}.conv.depthwise_conv.weight", f32(n_state, n_conv_ker))
            write_tensor(fout, f"{p}.conv.batch_norm.weight", f32(n_state))
            write_tensor(fout, f"{p}.conv.batch_norm.bias", f32(n_state))
            write_tensor(fout, f"{p}.conv.batch_norm.running_mean", f32(n_state))
            # abs() keeps the running variance non-negative.
            write_tensor(fout, f"{p}.conv.batch_norm.running_var", np.abs(f32(n_state)))
            # Integer counter; write_tensor converts it to float32 like every
            # other tensor.
            num_batches = np.zeros(1, dtype=np.int32)
            write_tensor(fout, f"{p}.conv.batch_norm.num_batches_tracked", num_batches)
            write_tensor(fout, f"{p}.conv.pointwise_conv2.weight", f32(n_state, n_state))

            write_tensor(fout, f"{p}.norm_self_att.weight", f32(n_state))
            write_tensor(fout, f"{p}.norm_self_att.bias", f32(n_state))

            write_tensor(fout, f"{p}.self_attn.pos_bias_u", f32(n_head, n_head_dim))
            write_tensor(fout, f"{p}.self_attn.pos_bias_v", f32(n_head, n_head_dim))
            write_tensor(fout, f"{p}.self_attn.linear_q.weight", f32(n_state, n_state))
            write_tensor(fout, f"{p}.self_attn.linear_k.weight", f32(n_state, n_state))
            write_tensor(fout, f"{p}.self_attn.linear_v.weight", f32(n_state, n_state))
            write_tensor(fout, f"{p}.self_attn.linear_out.weight", f32(n_state, n_state))
            write_tensor(fout, f"{p}.self_attn.linear_pos.weight", f32(n_state, n_state))

            write_tensor(fout, f"{p}.norm_feed_forward2.weight", f32(n_state))
            write_tensor(fout, f"{p}.norm_feed_forward2.bias", f32(n_state))
            write_tensor(fout, f"{p}.feed_forward2.linear1.weight", f32(4*n_state, n_state))
            write_tensor(fout, f"{p}.feed_forward2.linear2.weight", f32(n_state, 4*n_state))

            write_tensor(fout, f"{p}.norm_out.weight", f32(n_state))
            write_tensor(fout, f"{p}.norm_out.bias", f32(n_state))

        write_tensor(fout, "decoder.prediction.embed.weight", f32(n_pred_embed, dec_dim))

        base = "decoder.prediction.dec_rnn.lstm"
        for i in range(n_pred_l):
            write_tensor(fout, f"{base}.weight_ih_l{i}", reorder_gates(f32(n_lstm_gates, dec_dim)))
            write_tensor(fout, f"{base}.weight_hh_l{i}", reorder_gates(f32(n_lstm_gates, dec_dim)))
            # A single bias tensor is stored per layer: the sum of two random
            # draws, reordered like the weights.
            write_tensor(fout, f"{base}.bias_h_l{i}", reorder_gates(f32(n_lstm_gates) + f32(n_lstm_gates)))

        write_tensor(fout, "joint.pred.weight", f32(dec_dim, dec_dim))
        write_tensor(fout, "joint.pred.bias", f32(dec_dim))
        write_tensor(fout, "joint.enc.weight", f32(dec_dim, n_state))
        write_tensor(fout, "joint.enc.bias", f32(dec_dim))
        write_tensor(fout, "joint.joint_net.2.weight", f32(n_joint_out, dec_dim))
        write_tensor(fout, "joint.joint_net.2.bias", f32(n_joint_out))

    # Report the size only after the file has been closed and flushed.
    size = Path(output_path).stat().st_size
    print(f"Generated {output_path} ({size / 1024:.1f} KB)")
|
|
|
|
if __name__ == '__main__':
    # The first command-line argument, when given, overrides the default
    # output location.
    if len(sys.argv) > 1:
        output = sys.argv[1]
    else:
        output = 'models/for-tests-ggml-parakeet-tdt.bin'
    generate(output)
|