whisper.cpp/models/generate-parakeet-test-mode...

#!/usr/bin/env python3
import struct
import sys
import numpy as np
from pathlib import Path

def write_tensor(fout, name, data):
    """Serialise one named tensor record to the open binary file *fout*.

    Record layout, in write order:
      1. three native ints: number of dimensions, byte length of the
         UTF-8 encoded name, and the type id (always 0 here),
      2. one native int per dimension, emitted from the last axis to the
         first (i.e. the numpy shape reversed),
      3. the UTF-8 name bytes (no terminator),
      4. the raw element data, after conversion to float32.

    Args:
        fout: file object opened for binary writing. ``ndarray.tofile`` is
            used for the payload, so it must be a real file.
        name: tensor name as a ``str``.
        data: numpy array of any dtype; it is always stored as float32.
    """
    tensor = data.astype(np.float32)
    encoded_name = name.encode('utf-8')
    type_id = 0  # GGML_TYPE_F32

    header = struct.pack("iii", tensor.ndim, len(encoded_name), type_id)
    fout.write(header)

    # Dimensions go out innermost-first, the reverse of numpy's shape order.
    for extent in reversed(tensor.shape):
        fout.write(struct.pack("i", extent))

    fout.write(encoded_name)
    tensor.tofile(fout)

def generate(output_path):
    """Write a tiny, randomly initialised test model file to *output_path*.

    All random values come from a single ``numpy`` generator seeded with 42
    and are drawn in a fixed order, so repeated runs produce the same file.

    File layout, in write order:
      1. magic number ``0x67676d6c`` (native unsigned int),
      2. every hyper-parameter value as a native int, in the order the
         ``hparams`` dict below declares them,
      3. ``n_mels``, ``n_freqs`` and an ``(n_mels, n_freqs)`` float32 matrix
         (presumably the mel filterbank -- confirm against the loader),
      4. ``n_fft`` and an ``n_fft``-long float32 vector
         (presumably the analysis window -- confirm against the loader),
      5. ``n_tdt_durations`` unsigned ints holding 0, 1, 2, ...,
      6. the vocabulary: token count, then each token as
         (byte length, UTF-8 bytes),
      7. the tensors, each written through ``write_tensor``.

    Args:
        output_path: destination path; the file is created or overwritten.

    A one-line summary with the resulting file size is printed at the end.
    """
    rng = np.random.default_rng(42)

    # NOTE: insertion order is significant -- the values are serialised in
    # exactly this order when the header is written.
    hparams = {
        'n_vocab':                10,
        'n_audio_ctx':            3200,
        'n_audio_state':          8,
        'n_audio_head':           2,
        'n_audio_layer':          1,
        'n_mels':                 16,
        'ftype':                  0,
        'n_fft':                  64,
        'subsampling_factor':     8,
        'n_subsampling_channels': 4,
        'n_conv_kernel':          3,
        'n_pred_dim':             8,
        'n_pred_layers':          1,
        'n_tdt_durations':        2,
        'n_max_tokens':           5,
    }

    vocab_size  = hparams['n_vocab']
    d_model     = hparams['n_audio_state']
    heads       = hparams['n_audio_head']
    enc_layers  = hparams['n_audio_layer']
    mel_bins    = hparams['n_mels']
    fft_size    = hparams['n_fft']
    sub_factor  = hparams['subsampling_factor']
    sub_ch      = hparams['n_subsampling_channels']
    conv_kernel = hparams['n_conv_kernel']
    pred_dim    = hparams['n_pred_dim']
    pred_layers = hparams['n_pred_layers']
    tdt_count   = hparams['n_tdt_durations']

    # Sizes derived from the hyper-parameters.
    pre_enc_dim = (mel_bins // sub_factor) * sub_ch
    head_dim    = d_model // heads
    embed_rows  = vocab_size + 1
    lstm_rows   = 4 * pred_dim
    joint_out   = vocab_size + tdt_count + 1
    freq_bins   = fft_size // 2 + 1

    def randn(*shape):
        """Draw a standard-normal float32 array of the given shape."""
        return rng.standard_normal(shape).astype(np.float32)

    def reorder_gates(data):
        """Swap the third and fourth quarter of *data* along axis 0."""
        q = data.shape[0] // 4
        first, second, third, fourth = data[:q], data[q:2*q], data[2*q:3*q], data[3*q:]
        return np.concatenate([first, second, fourth, third], axis=0)

    with open(output_path, 'wb') as fout:

        def emit(name, *shape):
            """Write a freshly drawn random tensor of *shape* under *name*."""
            write_tensor(fout, name, randn(*shape))

        # --- header: magic + hyper-parameters -------------------------------
        fout.write(struct.pack("I", 0x67676d6c))
        for value in hparams.values():
            fout.write(struct.pack("i", value))

        # --- random (n_mels, n_freqs) matrix, prefixed by its dimensions ----
        fout.write(struct.pack("ii", mel_bins, freq_bins))
        randn(mel_bins, freq_bins).tofile(fout)

        # --- random n_fft-long vector, prefixed by its length ---------------
        fout.write(struct.pack("i", fft_size))
        randn(fft_size).tofile(fout)

        # --- duration table: simply 0 .. tdt_count-1 ------------------------
        for duration in range(tdt_count):
            fout.write(struct.pack("I", duration))

        # --- vocabulary: three special tokens, then 'a', 'b', 'c', ... ------
        letters = [chr(code) for code in range(ord('a'), ord('a') + vocab_size - 3)]
        tokens = ['<unk>', '<s>', '</s>', *letters]
        assert len(tokens) == vocab_size
        fout.write(struct.pack("i", vocab_size))
        for encoded in (tok.encode('utf-8') for tok in tokens):
            fout.write(struct.pack("i", len(encoded)))
            fout.write(encoded)

        # --- encoder: pre-encode projection and convolution stack -----------
        emit("encoder.pre_encode.out.weight", d_model, pre_enc_dim)
        emit("encoder.pre_encode.out.bias", d_model)

        kernel_3x3 = (sub_ch, 1, 3, 3)
        kernel_1x1 = (sub_ch, sub_ch, 1, 1)
        conv_stack = (
            (0, kernel_3x3),
            (2, kernel_3x3),
            (3, kernel_1x1),
            (5, kernel_3x3),
            (6, kernel_1x1),
        )
        for conv_idx, weight_shape in conv_stack:
            emit(f"encoder.pre_encode.conv.{conv_idx}.weight", *weight_shape)
            emit(f"encoder.pre_encode.conv.{conv_idx}.bias", 1, sub_ch, 1, 1)

        # --- encoder layers -------------------------------------------------
        for layer in range(enc_layers):
            prefix = f"encoder.layers.{layer}"

            emit(f"{prefix}.norm_feed_forward1.weight", d_model)
            emit(f"{prefix}.norm_feed_forward1.bias", d_model)
            emit(f"{prefix}.feed_forward1.linear1.weight", 4*d_model, d_model)
            emit(f"{prefix}.feed_forward1.linear2.weight", d_model, 4*d_model)

            emit(f"{prefix}.norm_conv.weight", d_model)
            emit(f"{prefix}.norm_conv.bias", d_model)
            emit(f"{prefix}.conv.pointwise_conv1.weight", 2*d_model, d_model)
            emit(f"{prefix}.conv.depthwise_conv.weight", d_model, conv_kernel)
            for stat in ("weight", "bias", "running_mean"):
                emit(f"{prefix}.conv.batch_norm.{stat}", d_model)
            # The running variance is made non-negative via abs().
            write_tensor(fout, f"{prefix}.conv.batch_norm.running_var",
                         np.abs(randn(d_model)))
            # A single zero; consumes no random numbers.
            write_tensor(fout, f"{prefix}.conv.batch_norm.num_batches_tracked",
                         np.zeros(1, dtype=np.int32))
            emit(f"{prefix}.conv.pointwise_conv2.weight", d_model, d_model)

            emit(f"{prefix}.norm_self_att.weight", d_model)
            emit(f"{prefix}.norm_self_att.bias", d_model)

            emit(f"{prefix}.self_attn.pos_bias_u", heads, head_dim)
            emit(f"{prefix}.self_attn.pos_bias_v", heads, head_dim)
            for proj in ("q", "k", "v", "out", "pos"):
                emit(f"{prefix}.self_attn.linear_{proj}.weight", d_model, d_model)

            emit(f"{prefix}.norm_feed_forward2.weight", d_model)
            emit(f"{prefix}.norm_feed_forward2.bias", d_model)
            emit(f"{prefix}.feed_forward2.linear1.weight", 4*d_model, d_model)
            emit(f"{prefix}.feed_forward2.linear2.weight", d_model, 4*d_model)

            emit(f"{prefix}.norm_out.weight", d_model)
            emit(f"{prefix}.norm_out.bias", d_model)

        # --- prediction network ---------------------------------------------
        emit("decoder.prediction.embed.weight", embed_rows, pred_dim)

        lstm_prefix = "decoder.prediction.dec_rnn.lstm"
        for layer in range(pred_layers):
            write_tensor(fout, f"{lstm_prefix}.weight_ih_l{layer}",
                         reorder_gates(randn(lstm_rows, pred_dim)))
            write_tensor(fout, f"{lstm_prefix}.weight_hh_l{layer}",
                         reorder_gates(randn(lstm_rows, pred_dim)))
            # Two independent draws are summed into a single stored bias.
            bias_first = randn(lstm_rows)
            bias_second = randn(lstm_rows)
            write_tensor(fout, f"{lstm_prefix}.bias_h_l{layer}",
                         reorder_gates(bias_first + bias_second))

        # --- joint network --------------------------------------------------
        emit("joint.pred.weight", pred_dim, pred_dim)
        emit("joint.pred.bias", pred_dim)
        emit("joint.enc.weight", pred_dim, d_model)
        emit("joint.enc.bias", pred_dim)
        emit("joint.joint_net.2.weight", joint_out, pred_dim)
        emit("joint.joint_net.2.bias", joint_out)

    size = Path(output_path).stat().st_size
    print(f"Generated {output_path} ({size / 1024:.1f} KB)")

if __name__ == '__main__':
    # The first command-line argument, when given, overrides the default
    # output location.
    cli_args = sys.argv[1:]
    generate(cli_args[0] if cli_args else 'models/for-tests-ggml-parakeet-tdt.bin')