Compare commits
243 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
4bf733672b | |
|
|
18162bcf61 | |
|
|
8384aa8086 | |
|
|
bbdaa21aa7 | |
|
|
a5a8496d31 | |
|
|
28f8534532 | |
|
|
4861a3eeb5 | |
|
|
f2ce24fa5c | |
|
|
9623c1203b | |
|
|
95053f68e4 | |
|
|
35cb684129 | |
|
|
e10025351c | |
|
|
ccd04522f9 | |
|
|
b34a9f3d83 | |
|
|
0c7c3ba570 | |
|
|
582d2562a4 | |
|
|
d74c56862b | |
|
|
66392cf1a2 | |
|
|
aec8e69c2f | |
|
|
9f2cec1840 | |
|
|
c59a773605 | |
|
|
320c048724 | |
|
|
ad670182d9 | |
|
|
44e7803661 | |
|
|
6119537e9a | |
|
|
fa20229eeb | |
|
|
3076725eb0 | |
|
|
5301139374 | |
|
|
c200b588f8 | |
|
|
b553e17071 | |
|
|
e69c109aac | |
|
|
4ea5b6febc | |
|
|
35fa508360 | |
|
|
0fa31f9bb6 | |
|
|
6fceff2eb4 | |
|
|
ca624d86ab | |
|
|
70e4c0aec0 | |
|
|
9c233f11f0 | |
|
|
f675a8c926 | |
|
|
c9ba41397c | |
|
|
f5c3ce17d5 | |
|
|
1478450e61 | |
|
|
7296b9c7fa | |
|
|
9bf6c3c860 | |
|
|
2f3df42cdd | |
|
|
4e11277a19 | |
|
|
93a3f37642 | |
|
|
1be2adf7b3 | |
|
|
da738a74f5 | |
|
|
21da84303e | |
|
|
6296fd5a90 | |
|
|
c235b05d8a | |
|
|
c546b0b1bc | |
|
|
35d679a4f8 | |
|
|
6576c4da90 | |
|
|
07d6db39e5 | |
|
|
dfb8b68799 | |
|
|
23921d5a69 | |
|
|
641998f558 | |
|
|
71b1ab3784 | |
|
|
682ee99305 | |
|
|
1aba061737 | |
|
|
b938c5026c | |
|
|
df528c4f71 | |
|
|
b6b547885c | |
|
|
393fdffe20 | |
|
|
d2a26dc8e2 | |
|
|
0fbe4c4ca7 | |
|
|
c5bb7c0078 | |
|
|
447be522e9 | |
|
|
d6a417408c | |
|
|
2e5eb6e951 | |
|
|
84a6b5c039 | |
|
|
e2014d6959 | |
|
|
3a73f9cf0b | |
|
|
150cef5a5f | |
|
|
85bbc82209 | |
|
|
e7cffdbd0b | |
|
|
b13deaabae | |
|
|
239c5c86c3 | |
|
|
6429023e5f | |
|
|
2b9fb0be77 | |
|
|
5f21fdcbb9 | |
|
|
931cf2f3a8 | |
|
|
b8f57c9c50 | |
|
|
945746b40c | |
|
|
671fd1527a | |
|
|
171f037fba | |
|
|
32789b9e07 | |
|
|
a899e4bdcb | |
|
|
cbbe935765 | |
|
|
918e0ad209 | |
|
|
77c0630ce6 | |
|
|
b25d5d050b | |
|
|
57a48a4850 | |
|
|
820438ae2c | |
|
|
655c0750f5 | |
|
|
94d6d0b743 | |
|
|
07c181b57f | |
|
|
092330b474 | |
|
|
f62bb13320 | |
|
|
7fe6b8e171 | |
|
|
c6d1fbf31f | |
|
|
2a785c5969 | |
|
|
9638e29657 | |
|
|
7e57b20d53 | |
|
|
182db04cb2 | |
|
|
86d94cd95b | |
|
|
24cc89e477 | |
|
|
44d86c4921 | |
|
|
08e412c862 | |
|
|
45365fa111 | |
|
|
7024f7e5c1 | |
|
|
691b1d0826 | |
|
|
80f7be74bb | |
|
|
bfdcd4a92c | |
|
|
b732f4d9b5 | |
|
|
cdeaa34174 | |
|
|
0f99a47177 | |
|
|
d9ed371c2c | |
|
|
36b7bb3d95 | |
|
|
655072cd78 | |
|
|
b907207312 | |
|
|
c0b46c2f8f | |
|
|
e0c8e505e9 | |
|
|
34381b01c4 | |
|
|
3af7c879bc | |
|
|
28ce072f59 | |
|
|
2580cfc703 | |
|
|
3fc738a8c2 | |
|
|
458ad1d93e | |
|
|
28347201fc | |
|
|
c77a33df06 | |
|
|
bb895c843d | |
|
|
c4c6e143a7 | |
|
|
f0ee409f7b | |
|
|
4598eb080b | |
|
|
1d555510de | |
|
|
2c7472939f | |
|
|
16dd171620 | |
|
|
e70c0d43f4 | |
|
|
15deafa31e | |
|
|
fa2eaa433b | |
|
|
d91d1e8e6c | |
|
|
d1456437e1 | |
|
|
5ef7aafa06 | |
|
|
f1d2b83db0 | |
|
|
78b4fd85e1 | |
|
|
18c98ffaf7 | |
|
|
a1f76fb4cf | |
|
|
1ebf3cafa0 | |
|
|
9cbc4b3acb | |
|
|
0c2fbd4703 | |
|
|
7b19b94c5d | |
|
|
42e4a28865 | |
|
|
c031045531 | |
|
|
d6cfdc669c | |
|
|
3f5117610b | |
|
|
321f628239 | |
|
|
c5a5e6528e | |
|
|
7f6c0ac20f | |
|
|
514eabc1e5 | |
|
|
444662bc83 | |
|
|
08108512c7 | |
|
|
82bb26fba1 | |
|
|
9a40dd9365 | |
|
|
fab70d287e | |
|
|
981195be5a | |
|
|
ace95aac6b | |
|
|
1971a362dc | |
|
|
5c5b88eb77 | |
|
|
1b95f84550 | |
|
|
933bd1f79c | |
|
|
78f54d15d8 | |
|
|
21b9dd6789 | |
|
|
5ffe58838d | |
|
|
952c66237d | |
|
|
6ac5a50005 | |
|
|
75b9543856 | |
|
|
40ddc5a5b9 | |
|
|
6b67c91879 | |
|
|
9e96d390f7 | |
|
|
763a454052 | |
|
|
fc674574ca | |
|
|
166c20b473 | |
|
|
95ea8f9bfb | |
|
|
759f0084b4 | |
|
|
52699f6d19 | |
|
|
7f466e237b | |
|
|
b564a99ed6 | |
|
|
45a7083431 | |
|
|
1848f994e3 | |
|
|
07237ff99e | |
|
|
eb747f3def | |
|
|
a050c7d1bf | |
|
|
495b77aec2 | |
|
|
3987857d2d | |
|
|
f2a8e65ea7 | |
|
|
9e4e4c2401 | |
|
|
eef7422d4d | |
|
|
116a9f6ab7 | |
|
|
624be93425 | |
|
|
37c0a52c1b | |
|
|
c589dd77d4 | |
|
|
54f5c02f29 | |
|
|
a0e41ec261 | |
|
|
c976b22d7b | |
|
|
607c92430f | |
|
|
1d0f0285de | |
|
|
69f0d907ee | |
|
|
77b635e9c4 | |
|
|
5f3428219a | |
|
|
22710fdb82 | |
|
|
ca5d565dcd | |
|
|
49b505bcc5 | |
|
|
46dcb35aa3 | |
|
|
65d820a44a | |
|
|
e1cdce46c5 | |
|
|
15f6b6ad76 | |
|
|
081dc773a5 | |
|
|
551bb82960 | |
|
|
43c7c0f86c | |
|
|
fea629d00f | |
|
|
2a6de29364 | |
|
|
3d004fbf0a | |
|
|
12015a2174 | |
|
|
dfba84cb47 | |
|
|
d6a0f0d075 | |
|
|
14caedfa18 | |
|
|
61c7cd024d | |
|
|
e222814fc4 | |
|
|
16ca5e6fb1 | |
|
|
906aef3da8 | |
|
|
c890a9d9b4 | |
|
|
0ad6ceef59 | |
|
|
ab7d305b75 | |
|
|
49adc8b470 | |
|
|
6494251197 | |
|
|
9232af59ba | |
|
|
724ea71cf9 | |
|
|
dae7781052 | |
|
|
1335dfa785 | |
|
|
76684141a5 |
|
|
@ -202,6 +202,8 @@ whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
|
||||||
|
|
||||||
Note that transcription occasionally might be low accuracy when it works in parallel.
|
Note that transcription occasionally might be low accuracy when it works in parallel.
|
||||||
|
|
||||||
|
If n_processors is greater than 1, you cannot set any callbacks including new_segment_callback, progress_callback, encoder_begin_callback, abort_callback, and log_callback set by Whisper.log_set.
|
||||||
|
|
||||||
### Segments ###
|
### Segments ###
|
||||||
|
|
||||||
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
|
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
|
||||||
|
|
|
||||||
|
|
@ -112,6 +112,10 @@ ruby_whisper_log_callback(enum ggml_log_level level, const char * buffer, void *
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||||
|
if (NIL_P(log_callback)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
VALUE udata = rb_iv_get(mWhisper, "user_data");
|
VALUE udata = rb_iv_get(mWhisper, "user_data");
|
||||||
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
|
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
|
||||||
}
|
}
|
||||||
|
|
@ -129,10 +133,16 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
|
||||||
rb_iv_set(self, "log_callback", log_callback);
|
rb_iv_set(self, "log_callback", log_callback);
|
||||||
rb_iv_set(self, "user_data", user_data);
|
rb_iv_set(self, "user_data", user_data);
|
||||||
|
|
||||||
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
if (!NIL_P(log_callback)) {
|
||||||
rb_define_finalizer(log_callback, finalize_log_callback);
|
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
||||||
|
rb_define_finalizer(log_callback, finalize_log_callback);
|
||||||
|
}
|
||||||
|
|
||||||
whisper_log_set(ruby_whisper_log_callback, NULL);
|
if (NIL_P(log_callback)) {
|
||||||
|
whisper_log_set(NULL, NULL);
|
||||||
|
} else {
|
||||||
|
whisper_log_set(ruby_whisper_log_callback, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
return Qnil;
|
return Qnil;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
#define RUBY_WHISPER_H
|
#define RUBY_WHISPER_H
|
||||||
|
|
||||||
#include <ruby.h>
|
#include <ruby.h>
|
||||||
|
#include <ruby/util.h>
|
||||||
#include <ruby/memory_view.h>
|
#include <ruby/memory_view.h>
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ extern const rb_data_type_t ruby_whisper_context_params_type;
|
||||||
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
||||||
extern VALUE rb_whisper_model_s_new(VALUE context);
|
extern VALUE rb_whisper_model_s_new(VALUE context);
|
||||||
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
||||||
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
|
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors);
|
||||||
|
|
||||||
ID transcribe_option_names[1];
|
ID transcribe_option_names[1];
|
||||||
|
|
||||||
|
|
@ -436,7 +436,7 @@ full_body(VALUE rb_args)
|
||||||
GetContext(*args->context, rw);
|
GetContext(*args->context, rw);
|
||||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
|
|
||||||
prepare_transcription(rwp, args->context);
|
prepare_transcription(rwp, args->context, 1);
|
||||||
int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples);
|
int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples);
|
||||||
|
|
||||||
return INT2NUM(result);
|
return INT2NUM(result);
|
||||||
|
|
@ -487,7 +487,7 @@ full_parallel_body(VALUE rb_args)
|
||||||
GetContext(*args->context, rw);
|
GetContext(*args->context, rw);
|
||||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
|
|
||||||
prepare_transcription(rwp, args->context);
|
prepare_transcription(rwp, args->context, args->n_processors);
|
||||||
int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors);
|
int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors);
|
||||||
|
|
||||||
return INT2NUM(result);
|
return INT2NUM(result);
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@
|
||||||
|
|
||||||
extern VALUE cParams;
|
extern VALUE cParams;
|
||||||
extern VALUE cVADParams;
|
extern VALUE cVADParams;
|
||||||
|
extern VALUE mWhisper;
|
||||||
|
|
||||||
extern ID id_call;
|
extern ID id_call;
|
||||||
|
|
||||||
|
|
@ -186,6 +187,35 @@ static bool abort_callback(void * user_data) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
check_thread_safety(ruby_whisper_params *rwp, VALUE *context, int n_processors)
|
||||||
|
{
|
||||||
|
if (n_processors == 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
||||||
|
rb_raise(rb_eRuntimeError, "new segment callback not supported on parallel transcription");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!NIL_P(rwp->progress_callback_container->callback) || 0 != RARRAY_LEN(rwp->progress_callback_container->callbacks)) {
|
||||||
|
rb_raise(rb_eRuntimeError, "progress callback not supported on parallel transcription");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!NIL_P(rwp->encoder_begin_callback_container->callback) || 0 != RARRAY_LEN(rwp->encoder_begin_callback_container->callbacks)) {
|
||||||
|
rb_raise(rb_eRuntimeError, "encoder begin callback not supported on parallel transcription");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!NIL_P(rwp->abort_callback_container->callback) || 0 != RARRAY_LEN(rwp->abort_callback_container->callbacks)) {
|
||||||
|
rb_raise(rb_eRuntimeError, "abort callback not supported on parallel transcription");
|
||||||
|
}
|
||||||
|
|
||||||
|
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||||
|
if (!NIL_P(log_callback)) {
|
||||||
|
rb_raise(rb_eRuntimeError, "log callback not supported for parallel transcription");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||||
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
||||||
rwp->new_segment_callback_container->context = context;
|
rwp->new_segment_callback_container->context = context;
|
||||||
|
|
@ -219,9 +249,13 @@ static void set_vad_params(ruby_whisper_params *rwp)
|
||||||
rwp->params.vad_params = rwvp->params;
|
rwp->params.vad_params = rwvp->params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
TODO: Set abort callback to trap SIGINT and SIGTERM
|
||||||
|
*/
|
||||||
void
|
void
|
||||||
prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
|
prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors)
|
||||||
{
|
{
|
||||||
|
check_thread_safety(rwp, context, n_processors);
|
||||||
register_callbacks(rwp, context);
|
register_callbacks(rwp, context);
|
||||||
set_vad_params(rwp);
|
set_vad_params(rwp);
|
||||||
}
|
}
|
||||||
|
|
@ -240,6 +274,20 @@ rb_whisper_params_mark(void *p)
|
||||||
void
|
void
|
||||||
ruby_whisper_params_free(ruby_whisper_params *rwp)
|
ruby_whisper_params_free(ruby_whisper_params *rwp)
|
||||||
{
|
{
|
||||||
|
if (rwp->params.language) {
|
||||||
|
ruby_xfree((void *)rwp->params.language);
|
||||||
|
}
|
||||||
|
if (rwp->params.initial_prompt) {
|
||||||
|
ruby_xfree((void *)rwp->params.initial_prompt);
|
||||||
|
}
|
||||||
|
if (rwp->params.vad_model_path) {
|
||||||
|
ruby_xfree((void *)rwp->params.vad_model_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
xfree(rwp->new_segment_callback_container);
|
||||||
|
xfree(rwp->progress_callback_container);
|
||||||
|
xfree(rwp->encoder_begin_callback_container);
|
||||||
|
xfree(rwp->abort_callback_container);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -248,7 +296,7 @@ rb_whisper_params_free(void *p)
|
||||||
ruby_whisper_params *rwp = (ruby_whisper_params *)p;
|
ruby_whisper_params *rwp = (ruby_whisper_params *)p;
|
||||||
// How to free user_data and callback only when not referred to by others?
|
// How to free user_data and callback only when not referred to by others?
|
||||||
ruby_whisper_params_free(rwp);
|
ruby_whisper_params_free(rwp);
|
||||||
free(rwp);
|
xfree(rwp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t
|
static size_t
|
||||||
|
|
@ -276,6 +324,15 @@ ruby_whisper_params_allocate(VALUE klass)
|
||||||
ruby_whisper_params *rwp;
|
ruby_whisper_params *rwp;
|
||||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||||
|
if (rwp->params.language != NULL) {
|
||||||
|
rwp->params.language = ruby_strdup(rwp->params.language);
|
||||||
|
}
|
||||||
|
if (rwp->params.initial_prompt != NULL) {
|
||||||
|
rwp->params.initial_prompt = ruby_strdup(rwp->params.initial_prompt);
|
||||||
|
}
|
||||||
|
if (rwp->params.vad_model_path != NULL) {
|
||||||
|
rwp->params.vad_model_path = ruby_strdup(rwp->params.vad_model_path);
|
||||||
|
}
|
||||||
rwp->diarize = false;
|
rwp->diarize = false;
|
||||||
rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
|
rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
|
||||||
rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
|
rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
|
||||||
|
|
@ -296,10 +353,12 @@ ruby_whisper_params_set_language(VALUE self, VALUE value)
|
||||||
{
|
{
|
||||||
ruby_whisper_params *rwp;
|
ruby_whisper_params *rwp;
|
||||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
|
ruby_xfree((void *)rwp->params.language);
|
||||||
|
rwp->params.language = NULL;
|
||||||
if (value == Qfalse || value == Qnil) {
|
if (value == Qfalse || value == Qnil) {
|
||||||
rwp->params.language = "auto";
|
rwp->params.language = ruby_strdup("auto");
|
||||||
} else {
|
} else {
|
||||||
rwp->params.language = StringValueCStr(value);
|
rwp->params.language = ruby_strdup(StringValueCStr(value));
|
||||||
}
|
}
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
@ -608,7 +667,13 @@ ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
|
||||||
{
|
{
|
||||||
ruby_whisper_params *rwp;
|
ruby_whisper_params *rwp;
|
||||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
rwp->params.initial_prompt = StringValueCStr(value);
|
ruby_xfree((void *)rwp->params.initial_prompt);
|
||||||
|
rwp->params.initial_prompt = NULL;
|
||||||
|
if (NIL_P(value)) {
|
||||||
|
rwp->params.initial_prompt = NULL;
|
||||||
|
} else {
|
||||||
|
rwp->params.initial_prompt = ruby_strdup(StringValueCStr(value));
|
||||||
|
}
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
|
@ -1103,12 +1168,14 @@ ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
|
||||||
{
|
{
|
||||||
ruby_whisper_params *rwp;
|
ruby_whisper_params *rwp;
|
||||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||||
|
ruby_xfree((void *)rwp->params.vad_model_path);
|
||||||
|
rwp->params.vad_model_path = NULL;
|
||||||
if (NIL_P(value)) {
|
if (NIL_P(value)) {
|
||||||
rwp->params.vad_model_path = NULL;
|
rwp->params.vad_model_path = NULL;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
VALUE path = ruby_whisper_normalize_model_path(value);
|
VALUE path = ruby_whisper_normalize_model_path(value);
|
||||||
rwp->params.vad_model_path = StringValueCStr(path);
|
rwp->params.vad_model_path = ruby_strdup(StringValueCStr(path));
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ extern ID id_to_path;
|
||||||
extern ID transcribe_option_names[1];
|
extern ID transcribe_option_names[1];
|
||||||
|
|
||||||
extern void
|
extern void
|
||||||
prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
|
prepare_transcription(ruby_whisper_params * rwp, VALUE * self, int n_processors);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* transcribe a single file
|
* transcribe a single file
|
||||||
|
|
@ -73,7 +73,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
||||||
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
prepare_transcription(rwp, &self);
|
prepare_transcription(rwp, &self, n_processors);
|
||||||
|
|
||||||
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
|
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
|
||||||
fprintf(stderr, "failed to process audio\n");
|
fprintf(stderr, "failed to process audio\n");
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ module Whisper
|
||||||
def self.lang_id: (string name) -> Integer
|
def self.lang_id: (string name) -> Integer
|
||||||
def self.lang_str: (Integer id) -> String
|
def self.lang_str: (Integer id) -> String
|
||||||
def self.lang_str_full: (Integer id) -> String
|
def self.lang_str_full: (Integer id) -> String
|
||||||
def self.log_set: (log_callback, Object? user_data) -> log_callback
|
def self.log_set: (log_callback?, Object? user_data) -> log_callback
|
||||||
def self.system_info_str: () -> String
|
def self.system_info_str: () -> String
|
||||||
|
|
||||||
class Context
|
class Context
|
||||||
|
|
@ -52,6 +52,9 @@ module Whisper
|
||||||
# puts text
|
# puts text
|
||||||
# end
|
# end
|
||||||
#
|
#
|
||||||
|
# If n_processors is greater than 1, you cannot set any callbacks including
|
||||||
|
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||||
|
# and log_callback set by Whisper.log_set
|
||||||
def transcribe: (path, Params, ?n_processors: Integer) -> self
|
def transcribe: (path, Params, ?n_processors: Integer) -> self
|
||||||
| (path, Params, ?n_processors: Integer) { (String) -> void } -> self
|
| (path, Params, ?n_processors: Integer) { (String) -> void } -> self
|
||||||
|
|
||||||
|
|
@ -129,6 +132,9 @@ module Whisper
|
||||||
# It seems this approach can offer some speedup in some cases.
|
# It seems this approach can offer some speedup in some cases.
|
||||||
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||||
#
|
#
|
||||||
|
# If n_processors is greater than 1, you cannot set any callbacks including
|
||||||
|
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||||
|
# and log_callback set by Whisper.log_set
|
||||||
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
||||||
| (Params, _Samples, ?Integer n_samples) -> self
|
| (Params, _Samples, ?Integer n_samples) -> self
|
||||||
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,8 @@ class TestParams < TestBase
|
||||||
def test_language
|
def test_language
|
||||||
@params.language = "en"
|
@params.language = "en"
|
||||||
assert_equal @params.language, "en"
|
assert_equal @params.language, "en"
|
||||||
|
GC.compact
|
||||||
|
assert_equal @params.language, "en"
|
||||||
@params.language = "auto"
|
@params.language = "auto"
|
||||||
assert_equal @params.language, "auto"
|
assert_equal @params.language, "auto"
|
||||||
end
|
end
|
||||||
|
|
|
||||||
|
|
@ -43,9 +43,20 @@ class TestWhisper < TestBase
|
||||||
@whisper = Whisper::Context.new("base.en")
|
@whisper = Whisper::Context.new("base.en")
|
||||||
params = Whisper::Params.new
|
params = Whisper::Params.new
|
||||||
|
|
||||||
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
without_log_callback do
|
||||||
assert_match(/what you can do for your country/i, text)
|
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
||||||
}
|
assert_match(/what you can do for your country/i, text)
|
||||||
|
}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def without_log_callback
|
||||||
|
Whisper.log_set nil, nil
|
||||||
|
yield
|
||||||
|
ensure
|
||||||
|
Whisper.log_set ->(level, buffer, user_data) {}, nil
|
||||||
end
|
end
|
||||||
|
|
||||||
sub_test_case "After transcription" do
|
sub_test_case "After transcription" do
|
||||||
|
|
@ -229,7 +240,9 @@ class TestWhisper < TestBase
|
||||||
|
|
||||||
def test_full_parallel
|
def test_full_parallel
|
||||||
nprocessors = 2
|
nprocessors = 2
|
||||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
without_log_callback do
|
||||||
|
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||||
|
end
|
||||||
|
|
||||||
assert_equal nprocessors, @whisper.full_n_segments
|
assert_equal nprocessors, @whisper.full_n_segments
|
||||||
text = @whisper.each_segment.collect(&:text).join
|
text = @whisper.each_segment.collect(&:text).join
|
||||||
|
|
@ -240,7 +253,9 @@ class TestWhisper < TestBase
|
||||||
def test_full_parallel_with_memory_view
|
def test_full_parallel_with_memory_view
|
||||||
nprocessors = 2
|
nprocessors = 2
|
||||||
samples = JFKReader.new(AUDIO)
|
samples = JFKReader.new(AUDIO)
|
||||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
without_log_callback do
|
||||||
|
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||||
|
end
|
||||||
|
|
||||||
assert_equal nprocessors, @whisper.full_n_segments
|
assert_equal nprocessors, @whisper.full_n_segments
|
||||||
text = @whisper.each_segment.collect(&:text).join
|
text = @whisper.each_segment.collect(&:text).join
|
||||||
|
|
@ -259,7 +274,9 @@ class TestWhisper < TestBase
|
||||||
|
|
||||||
def test_full_parallel_without_length
|
def test_full_parallel_without_length
|
||||||
nprocessors = 2
|
nprocessors = 2
|
||||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
without_log_callback do
|
||||||
|
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||||
|
end
|
||||||
|
|
||||||
assert_equal nprocessors, @whisper.full_n_segments
|
assert_equal nprocessors, @whisper.full_n_segments
|
||||||
text = @whisper.each_segment.collect(&:text).join
|
text = @whisper.each_segment.collect(&:text).join
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ require_relative "extsources"
|
||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "whispercpp"
|
s.name = "whispercpp"
|
||||||
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
||||||
s.version = '1.3.6'
|
s.version = '1.3.7'
|
||||||
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
||||||
s.email = 'todd.fisher@gmail.com'
|
s.email = 'todd.fisher@gmail.com'
|
||||||
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ void bench_main(size_t index) {
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
|
fprintf(stderr, " https://github.com/ggml-org/whisper.cpp/issues/89\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Please include the following information:\n");
|
fprintf(stderr, "Please include the following information:\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
|
||||||
|
|
@ -157,7 +157,7 @@ static int whisper_bench_full(const whisper_params & params) {
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
|
fprintf(stderr, " https://github.com/ggml-org/whisper.cpp/issues/89\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Please include the following information:\n");
|
fprintf(stderr, "Please include the following information:\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,7 @@ bool ggml_common_quantize_0(
|
||||||
case GGML_FTYPE_MOSTLY_BF16:
|
case GGML_FTYPE_MOSTLY_BF16:
|
||||||
case GGML_FTYPE_MOSTLY_MXFP4:
|
case GGML_FTYPE_MOSTLY_MXFP4:
|
||||||
case GGML_FTYPE_MOSTLY_NVFP4:
|
case GGML_FTYPE_MOSTLY_NVFP4:
|
||||||
|
case GGML_FTYPE_MOSTLY_Q1_0:
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -215,6 +216,7 @@ bool ggml_common_quantize_0(
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
case GGML_TYPE_NVFP4:
|
case GGML_TYPE_NVFP4:
|
||||||
|
case GGML_TYPE_Q1_0:
|
||||||
case GGML_TYPE_COUNT:
|
case GGML_TYPE_COUNT:
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
||||||
|
|
|
||||||
|
|
@ -294,7 +294,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||||
}
|
}
|
||||||
|
|
||||||
// get extra buffer types of the CPU
|
// get extra buffer types of the CPU
|
||||||
// TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
|
// TODO: a more general solution for non-CPU extra buft should be implemented in the future
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
|
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
|
||||||
std::vector<ggml_backend_buffer_type_t> buft_extra;
|
std::vector<ggml_backend_buffer_type_t> buft_extra;
|
||||||
{
|
{
|
||||||
|
|
@ -418,7 +418,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
||||||
llama_adapter_lora * adapter = new llama_adapter_lora();
|
llama_adapter_lora * adapter = new llama_adapter_lora(model);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||||
|
|
@ -471,8 +471,17 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
||||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_adapter_lora_free(llama_adapter_lora *) {
|
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
|
||||||
// deprecated: adapters are freed by llama_model's destructor
|
if (adapter == nullptr) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (adapter->model != nullptr) {
|
||||||
|
adapter->model->loras.erase(adapter);
|
||||||
|
adapter->model = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete adapter;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,8 @@ struct llama_adapter_lora_weight {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_adapter_lora {
|
struct llama_adapter_lora {
|
||||||
|
llama_model * model = nullptr;
|
||||||
|
|
||||||
// map tensor name to lora_a_b
|
// map tensor name to lora_a_b
|
||||||
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
||||||
|
|
||||||
|
|
@ -75,7 +77,7 @@ struct llama_adapter_lora {
|
||||||
// activated lora (aLoRA)
|
// activated lora (aLoRA)
|
||||||
std::vector<llama_token> alora_invocation_tokens;
|
std::vector<llama_token> alora_invocation_tokens;
|
||||||
|
|
||||||
llama_adapter_lora() = default;
|
explicit llama_adapter_lora(llama_model * model) : model(model) {}
|
||||||
~llama_adapter_lora() = default;
|
~llama_adapter_lora() = default;
|
||||||
|
|
||||||
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -60,6 +60,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_GEMMA2,
|
LLM_ARCH_GEMMA2,
|
||||||
LLM_ARCH_GEMMA3,
|
LLM_ARCH_GEMMA3,
|
||||||
LLM_ARCH_GEMMA3N,
|
LLM_ARCH_GEMMA3N,
|
||||||
|
LLM_ARCH_GEMMA4,
|
||||||
LLM_ARCH_GEMMA_EMBEDDING,
|
LLM_ARCH_GEMMA_EMBEDDING,
|
||||||
LLM_ARCH_STARCODER2,
|
LLM_ARCH_STARCODER2,
|
||||||
LLM_ARCH_MAMBA,
|
LLM_ARCH_MAMBA,
|
||||||
|
|
@ -77,6 +78,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_ARCTIC,
|
LLM_ARCH_ARCTIC,
|
||||||
LLM_ARCH_DEEPSEEK,
|
LLM_ARCH_DEEPSEEK,
|
||||||
LLM_ARCH_DEEPSEEK2,
|
LLM_ARCH_DEEPSEEK2,
|
||||||
|
LLM_ARCH_DEEPSEEK2OCR,
|
||||||
LLM_ARCH_CHATGLM,
|
LLM_ARCH_CHATGLM,
|
||||||
LLM_ARCH_GLM4,
|
LLM_ARCH_GLM4,
|
||||||
LLM_ARCH_GLM4_MOE,
|
LLM_ARCH_GLM4_MOE,
|
||||||
|
|
@ -111,6 +113,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_ERNIE4_5_MOE,
|
LLM_ARCH_ERNIE4_5_MOE,
|
||||||
LLM_ARCH_HUNYUAN_MOE,
|
LLM_ARCH_HUNYUAN_MOE,
|
||||||
LLM_ARCH_HUNYUAN_DENSE,
|
LLM_ARCH_HUNYUAN_DENSE,
|
||||||
|
LLM_ARCH_HUNYUAN_VL,
|
||||||
LLM_ARCH_SMOLLM3,
|
LLM_ARCH_SMOLLM3,
|
||||||
LLM_ARCH_OPENAI_MOE,
|
LLM_ARCH_OPENAI_MOE,
|
||||||
LLM_ARCH_LFM2,
|
LLM_ARCH_LFM2,
|
||||||
|
|
@ -127,6 +130,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_RND1,
|
LLM_ARCH_RND1,
|
||||||
LLM_ARCH_PANGU_EMBED,
|
LLM_ARCH_PANGU_EMBED,
|
||||||
LLM_ARCH_MISTRAL3,
|
LLM_ARCH_MISTRAL3,
|
||||||
|
LLM_ARCH_MISTRAL4,
|
||||||
LLM_ARCH_PADDLEOCR,
|
LLM_ARCH_PADDLEOCR,
|
||||||
LLM_ARCH_MIMO2,
|
LLM_ARCH_MIMO2,
|
||||||
LLM_ARCH_STEP35,
|
LLM_ARCH_STEP35,
|
||||||
|
|
@ -167,6 +171,7 @@ enum llm_kv {
|
||||||
LLM_KV_CONTEXT_LENGTH,
|
LLM_KV_CONTEXT_LENGTH,
|
||||||
LLM_KV_EMBEDDING_LENGTH,
|
LLM_KV_EMBEDDING_LENGTH,
|
||||||
LLM_KV_EMBEDDING_LENGTH_OUT,
|
LLM_KV_EMBEDDING_LENGTH_OUT,
|
||||||
|
LLM_KV_EMBEDDING_LENGTH_PER_LAYER,
|
||||||
LLM_KV_FEATURES_LENGTH,
|
LLM_KV_FEATURES_LENGTH,
|
||||||
LLM_KV_BLOCK_COUNT,
|
LLM_KV_BLOCK_COUNT,
|
||||||
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
||||||
|
|
@ -240,6 +245,7 @@ enum llm_kv {
|
||||||
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
||||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||||
|
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
|
||||||
|
|
||||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||||
|
|
@ -249,6 +255,7 @@ enum llm_kv {
|
||||||
LLM_KV_ROPE_SCALE_LINEAR,
|
LLM_KV_ROPE_SCALE_LINEAR,
|
||||||
LLM_KV_ROPE_SCALING_TYPE,
|
LLM_KV_ROPE_SCALING_TYPE,
|
||||||
LLM_KV_ROPE_SCALING_FACTOR,
|
LLM_KV_ROPE_SCALING_FACTOR,
|
||||||
|
LLM_KV_ROPE_SCALING_ALPHA,
|
||||||
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
||||||
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
||||||
LLM_KV_ROPE_SCALING_FINETUNED,
|
LLM_KV_ROPE_SCALING_FINETUNED,
|
||||||
|
|
@ -367,6 +374,9 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_NORM,
|
||||||
LLM_TENSOR_FFN_POST_NORM,
|
LLM_TENSOR_FFN_POST_NORM,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_1,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_2,
|
||||||
|
LLM_TENSOR_FFN_PRE_NORM_2,
|
||||||
LLM_TENSOR_FFN_GATE,
|
LLM_TENSOR_FFN_GATE,
|
||||||
LLM_TENSOR_FFN_DOWN,
|
LLM_TENSOR_FFN_DOWN,
|
||||||
LLM_TENSOR_FFN_UP,
|
LLM_TENSOR_FFN_UP,
|
||||||
|
|
@ -391,6 +401,7 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_ATTN_Q_NORM,
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
LLM_TENSOR_ATTN_K_NORM,
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
LLM_TENSOR_LAYER_OUT_NORM,
|
LLM_TENSOR_LAYER_OUT_NORM,
|
||||||
|
LLM_TENSOR_LAYER_OUT_SCALE,
|
||||||
LLM_TENSOR_POST_ATTN_NORM,
|
LLM_TENSOR_POST_ATTN_NORM,
|
||||||
LLM_TENSOR_POST_MLP_NORM,
|
LLM_TENSOR_POST_MLP_NORM,
|
||||||
LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
|
LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
|
||||||
|
|
@ -576,8 +587,6 @@ struct LLM_TN_IMPL {
|
||||||
const int bid;
|
const int bid;
|
||||||
const int xid;
|
const int xid;
|
||||||
|
|
||||||
const std::set<llm_tensor> model_tensors;
|
|
||||||
|
|
||||||
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
|
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
|
||||||
|
|
||||||
std::string str() const;
|
std::string str() const;
|
||||||
|
|
@ -623,6 +632,7 @@ llm_arch llm_arch_from_string(const std::string & name);
|
||||||
|
|
||||||
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
|
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
|
||||||
|
|
||||||
bool llm_arch_is_recurrent(const llm_arch & arch);
|
bool llm_arch_is_recurrent (const llm_arch & arch);
|
||||||
bool llm_arch_is_hybrid (const llm_arch & arch);
|
bool llm_arch_is_hybrid (const llm_arch & arch);
|
||||||
bool llm_arch_is_diffusion(const llm_arch & arch);
|
bool llm_arch_is_diffusion (const llm_arch & arch);
|
||||||
|
bool llm_arch_supports_sm_tensor(const llm_arch & arch);
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ struct llama_ubatch {
|
||||||
}
|
}
|
||||||
|
|
||||||
// typical for M-RoPE cases:
|
// typical for M-RoPE cases:
|
||||||
// 0 - sequantial position of the tokens/embeddings in the sequence
|
// 0 - sequential position of the tokens/embeddings in the sequence
|
||||||
// 1 - y position in the image
|
// 1 - y position in the image
|
||||||
// 2 - x position in the image
|
// 2 - x position in the image
|
||||||
// 3 - other
|
// 3 - other
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||||
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
|
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
|
||||||
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
|
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
|
||||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||||
|
{ "deepseek-ocr", LLM_CHAT_TEMPLATE_DEEPSEEK_OCR },
|
||||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
||||||
|
|
@ -59,7 +60,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||||
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
|
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
|
||||||
{ "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
|
{ "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
|
||||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
{ "granite", LLM_CHAT_TEMPLATE_GRANITE_3_X },
|
||||||
|
{ "granite-4.0", LLM_CHAT_TEMPLATE_GRANITE_4_0 },
|
||||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||||
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
|
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
|
||||||
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
|
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
|
||||||
|
|
@ -71,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||||
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
||||||
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||||
|
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
|
||||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||||
|
|
@ -190,7 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||||
} else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
|
} else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
|
||||||
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
||||||
} else if (tmpl_contains("<|start_of_role|>")) {
|
} else if (tmpl_contains("<|start_of_role|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
if (tmpl_contains("<tool_call>") || tmpl_contains("<tools>")) {
|
||||||
|
return LLM_CHAT_TEMPLATE_GRANITE_4_0;
|
||||||
|
}
|
||||||
|
return LLM_CHAT_TEMPLATE_GRANITE_3_X;
|
||||||
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
||||||
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
||||||
} else if (tmpl_contains("<|role_start|>")) {
|
} else if (tmpl_contains("<|role_start|>")) {
|
||||||
|
|
@ -211,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||||
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
||||||
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
||||||
|
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
|
||||||
|
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
|
||||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||||
|
|
@ -548,6 +556,11 @@ int32_t llm_chat_apply_template(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << LU8("<|Assistant|>");
|
ss << LU8("<|Assistant|>");
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) {
|
||||||
|
for (auto message : chat) {
|
||||||
|
// no template
|
||||||
|
ss << message->content;
|
||||||
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
||||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||||
// EXAONE-3.0-7.8B-Instruct
|
// EXAONE-3.0-7.8B-Instruct
|
||||||
|
|
@ -611,8 +624,8 @@ int32_t llm_chat_apply_template(
|
||||||
ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
|
ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_3_X) {
|
||||||
// IBM Granite template
|
// IBM Granite 3.x template
|
||||||
for (const auto & message : chat) {
|
for (const auto & message : chat) {
|
||||||
std::string role(message->role);
|
std::string role(message->role);
|
||||||
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
|
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
|
||||||
|
|
@ -624,6 +637,20 @@ int32_t llm_chat_apply_template(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|start_of_role|>assistant<|end_of_role|>";
|
ss << "<|start_of_role|>assistant<|end_of_role|>";
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_4_0) {
|
||||||
|
// IBM Granite 4.0 template
|
||||||
|
for (const auto & message : chat) {
|
||||||
|
std::string role(message->role);
|
||||||
|
if (role == "assistant_tool_call") {
|
||||||
|
ss << "<|start_of_role|>assistant<|end_of_role|><|tool_call|>";
|
||||||
|
} else {
|
||||||
|
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
|
||||||
|
}
|
||||||
|
ss << message->content << "<|end_of_text|>\n";
|
||||||
|
}
|
||||||
|
if (add_ass) {
|
||||||
|
ss << "<|start_of_role|>assistant<|end_of_role|>";
|
||||||
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
|
||||||
// GigaChat template
|
// GigaChat template
|
||||||
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
|
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
|
||||||
|
|
@ -798,6 +825,22 @@ int32_t llm_chat_apply_template(
|
||||||
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
|
||||||
|
// tencent/HunyuanOCR
|
||||||
|
ss << "<|hy_begin▁of▁sentence|>";
|
||||||
|
for (size_t i = 0; i < chat.size(); i++) {
|
||||||
|
std::string role(chat[i]->role);
|
||||||
|
if (i == 0 && role == "system") {
|
||||||
|
ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (role == "user") {
|
||||||
|
ss << chat[i]->content << "<|hy_User|>";
|
||||||
|
} else if (role == "assistant") {
|
||||||
|
ss << chat[i]->content << "<|hy_Assistant|>";
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
||||||
// moonshotai/Kimi-K2-Instruct
|
// moonshotai/Kimi-K2-Instruct
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ enum llm_chat_template {
|
||||||
LLM_CHAT_TEMPLATE_DEEPSEEK,
|
LLM_CHAT_TEMPLATE_DEEPSEEK,
|
||||||
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
|
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
|
||||||
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
||||||
|
LLM_CHAT_TEMPLATE_DEEPSEEK_OCR,
|
||||||
LLM_CHAT_TEMPLATE_COMMAND_R,
|
LLM_CHAT_TEMPLATE_COMMAND_R,
|
||||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||||
LLM_CHAT_TEMPLATE_CHATGLM_3,
|
LLM_CHAT_TEMPLATE_CHATGLM_3,
|
||||||
|
|
@ -38,7 +39,8 @@ enum llm_chat_template {
|
||||||
LLM_CHAT_TEMPLATE_EXAONE_4,
|
LLM_CHAT_TEMPLATE_EXAONE_4,
|
||||||
LLM_CHAT_TEMPLATE_EXAONE_MOE,
|
LLM_CHAT_TEMPLATE_EXAONE_MOE,
|
||||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||||
LLM_CHAT_TEMPLATE_GRANITE,
|
LLM_CHAT_TEMPLATE_GRANITE_3_X,
|
||||||
|
LLM_CHAT_TEMPLATE_GRANITE_4_0,
|
||||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||||
LLM_CHAT_TEMPLATE_MEGREZ,
|
LLM_CHAT_TEMPLATE_MEGREZ,
|
||||||
LLM_CHAT_TEMPLATE_YANDEX,
|
LLM_CHAT_TEMPLATE_YANDEX,
|
||||||
|
|
@ -51,6 +53,7 @@ enum llm_chat_template {
|
||||||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||||
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
||||||
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||||
|
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
|
||||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||||
LLM_CHAT_TEMPLATE_GROK_2,
|
LLM_CHAT_TEMPLATE_GROK_2,
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#include "llama-context.h"
|
#include "llama-context.h"
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
#include "llama-arch.h"
|
#include "llama-arch.h"
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
#include "llama-batch.h"
|
#include "llama-batch.h"
|
||||||
|
|
@ -8,6 +9,7 @@
|
||||||
#include "llama-mmap.h"
|
#include "llama-mmap.h"
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
#include "llama-ext.h"
|
#include "llama-ext.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
@ -217,10 +219,10 @@ llama_context::llama_context(
|
||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// GPU backends
|
// GPU backends
|
||||||
for (auto * dev : model.devices) {
|
for (const auto & dev : model.devices) {
|
||||||
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
ggml_backend_t backend = ggml_backend_dev_init(dev.dev, nullptr);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
|
throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev.dev)));
|
||||||
}
|
}
|
||||||
backends.emplace_back(backend);
|
backends.emplace_back(backend);
|
||||||
}
|
}
|
||||||
|
|
@ -295,8 +297,8 @@ llama_context::llama_context(
|
||||||
|
|
||||||
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
|
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
|
||||||
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
||||||
auto * dev = model.devices[0];
|
const auto & dev = model.devices[0];
|
||||||
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
auto * host_buft = ggml_backend_dev_host_buffer_type(dev.dev);
|
||||||
if (host_buft) {
|
if (host_buft) {
|
||||||
buft = host_buft;
|
buft = host_buft;
|
||||||
}
|
}
|
||||||
|
|
@ -342,14 +344,6 @@ llama_context::llama_context(
|
||||||
|
|
||||||
if (cparams.pipeline_parallel) {
|
if (cparams.pipeline_parallel) {
|
||||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
||||||
|
|
||||||
if (!graph_reuse_disable) {
|
|
||||||
// TODO: figure out a way to make graph reuse work with pipeline parallelism
|
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
|
|
||||||
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
|
|
||||||
|
|
||||||
graph_reuse_disable = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sched_reserve();
|
sched_reserve();
|
||||||
|
|
@ -594,7 +588,7 @@ void llama_context::sched_reserve() {
|
||||||
|
|
||||||
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
||||||
{
|
{
|
||||||
// TODO: not sure if the following graph would be worster case for multi-stream KV caches:
|
// TODO: not sure if the following graph would be worst case for multi-stream KV caches:
|
||||||
//
|
//
|
||||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||||
//
|
//
|
||||||
|
|
@ -1028,9 +1022,11 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
|
||||||
|
|
||||||
for (auto & backend : backends) {
|
for (auto & backend : backends) {
|
||||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
||||||
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
if (reg) {
|
||||||
if (set_abort_callback_fn) {
|
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
||||||
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
|
if (set_abort_callback_fn) {
|
||||||
|
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1165,9 +1161,11 @@ bool llama_context::set_adapter_cvec(
|
||||||
int32_t il_end) {
|
int32_t il_end) {
|
||||||
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
|
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
|
||||||
|
|
||||||
// TODO: should we reserve?
|
bool res = cvec->apply(model, data, len, n_embd, il_start, il_end);
|
||||||
|
|
||||||
return cvec->apply(model, data, len, n_embd, il_start, il_end);
|
sched_need_reserve = true;
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||||
|
|
@ -1187,6 +1185,13 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
||||||
if (!graph_reuse_disable && res->can_reuse(gparams)) {
|
if (!graph_reuse_disable && res->can_reuse(gparams)) {
|
||||||
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
||||||
|
|
||||||
|
// with pipeline parallelism, the previous graph_compute_async may still be running
|
||||||
|
// on the GPU. we must synchronize before set_inputs to avoid overwriting input tensors
|
||||||
|
// that the previous compute is still reading.
|
||||||
|
if (cparams.pipeline_parallel) {
|
||||||
|
ggml_backend_sched_synchronize(sched.get());
|
||||||
|
}
|
||||||
|
|
||||||
n_reused++;
|
n_reused++;
|
||||||
} else {
|
} else {
|
||||||
res->reset();
|
res->reset();
|
||||||
|
|
@ -1345,8 +1350,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||||
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
||||||
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
||||||
|
|
||||||
embd_seq_out[seq_id].resize(n_embd);
|
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
|
||||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
|
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
|
||||||
|
const uint32_t n_embd_out = hparams.n_embd_out();
|
||||||
|
embd_seq_out[seq_id].resize(n_embd_out);
|
||||||
|
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_POOLING_TYPE_RANK:
|
case LLAMA_POOLING_TYPE_RANK:
|
||||||
|
|
@ -1767,12 +1775,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||||
// extract sequence embeddings (cleared before processing each batch)
|
// extract sequence embeddings (cleared before processing each batch)
|
||||||
auto & embd_seq_out = embd_seq;
|
auto & embd_seq_out = embd_seq;
|
||||||
|
|
||||||
|
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
|
||||||
|
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
|
||||||
|
const uint32_t n_embd_out = hparams.n_embd_out();
|
||||||
|
|
||||||
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
||||||
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
||||||
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
||||||
|
|
||||||
embd_seq_out[seq_id].resize(n_embd);
|
embd_seq_out[seq_id].resize(n_embd_out);
|
||||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
|
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_POOLING_TYPE_RANK:
|
case LLAMA_POOLING_TYPE_RANK:
|
||||||
|
|
@ -1944,6 +1956,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
ggml_backend_buffer_clear(buf_output.get(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
|
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
|
||||||
|
|
@ -2623,7 +2636,7 @@ void llama_context::perf_reset() {
|
||||||
n_reused = 0;
|
n_reused = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
llama_memory_breakdown llama_context::memory_breakdown() const {
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
||||||
for (const auto & [buft, size] : model.memory_breakdown()) {
|
for (const auto & [buft, size] : model.memory_breakdown()) {
|
||||||
ret[buft].model += size;
|
ret[buft].model += size;
|
||||||
|
|
@ -2933,7 +2946,22 @@ llama_context * llama_init_from_model(
|
||||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
|
if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
|
||||||
|
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
|
||||||
|
LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
|
||||||
|
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||||
|
}
|
||||||
|
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_ENABLED) {
|
||||||
|
LLAMA_LOG_ERROR("%s: SPLIT_MODE_TENSOR requires flash_attn to be enabled\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (ggml_is_quantized(params.type_k) || ggml_is_quantized(params.type_v)) {
|
||||||
|
LLAMA_LOG_ERROR("%s: simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
|
||||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
||||||
|
|
@ -2944,7 +2972,7 @@ llama_context * llama_init_from_model(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
|
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
|
||||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
||||||
|
|
@ -3465,142 +3493,6 @@ void llama_perf_context_reset(llama_context * ctx) {
|
||||||
ctx->perf_reset();
|
ctx->perf_reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_memory_breakdown_print(const struct llama_context * ctx) {
|
|
||||||
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
|
|
||||||
|
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
|
||||||
|
|
||||||
std::vector<std::array<std::string, 9>> table_data;
|
|
||||||
table_data.reserve(devices.size());
|
|
||||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
|
||||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
|
||||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
|
||||||
|
|
||||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
|
||||||
|
|
||||||
constexpr size_t MiB = 1024 * 1024;
|
|
||||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
|
||||||
|
|
||||||
// track seen buffer types to avoid double counting:
|
|
||||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
|
||||||
|
|
||||||
// accumulative memory breakdown for each device and for host:
|
|
||||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
|
||||||
llama_memory_breakdown_data mb_host;
|
|
||||||
|
|
||||||
for (const auto & buft_mb : memory_breakdown) {
|
|
||||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
|
||||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
|
||||||
if (ggml_backend_buft_is_host(buft)) {
|
|
||||||
mb_host.model += mb.model;
|
|
||||||
mb_host.context += mb.context;
|
|
||||||
mb_host.compute += mb.compute;
|
|
||||||
seen_buffer_types.insert(buft);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
|
||||||
if (dev) {
|
|
||||||
int i_dev = -1;
|
|
||||||
for (size_t i = 0; i < devices.size(); i++) {
|
|
||||||
if (devices[i] == dev) {
|
|
||||||
i_dev = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (i_dev != -1) {
|
|
||||||
mb_dev[i_dev].model += mb.model;
|
|
||||||
mb_dev[i_dev].context += mb.context;
|
|
||||||
mb_dev[i_dev].compute += mb.compute;
|
|
||||||
seen_buffer_types.insert(buft);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// print memory breakdown for each device:
|
|
||||||
for (size_t i = 0; i < devices.size(); i++) {
|
|
||||||
ggml_backend_dev_t dev = devices[i];
|
|
||||||
llama_memory_breakdown_data mb = mb_dev[i];
|
|
||||||
|
|
||||||
const std::string name = ggml_backend_dev_name(dev);
|
|
||||||
std::string desc = ggml_backend_dev_description(dev);
|
|
||||||
for (const std::string & prefix : desc_prefixes_strip) {
|
|
||||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
|
||||||
desc = desc.substr(prefix.length());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t free, total;
|
|
||||||
ggml_backend_dev_memory(dev, &free, &total);
|
|
||||||
|
|
||||||
const size_t self = mb.model + mb.context + mb.compute;
|
|
||||||
const size_t unaccounted = total - self - free;
|
|
||||||
|
|
||||||
table_data.push_back({
|
|
||||||
template_gpu,
|
|
||||||
" - " + name + " (" + desc + ")",
|
|
||||||
std::to_string(total / MiB),
|
|
||||||
std::to_string(free / MiB),
|
|
||||||
std::to_string(self / MiB),
|
|
||||||
std::to_string(mb.model / MiB),
|
|
||||||
std::to_string(mb.context / MiB),
|
|
||||||
std::to_string(mb.compute / MiB),
|
|
||||||
std::to_string(unaccounted / MiB)});
|
|
||||||
}
|
|
||||||
|
|
||||||
// print memory breakdown for host:
|
|
||||||
{
|
|
||||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
|
||||||
table_data.push_back({
|
|
||||||
template_other,
|
|
||||||
" - Host",
|
|
||||||
"", // total
|
|
||||||
"", // free
|
|
||||||
std::to_string(self / MiB),
|
|
||||||
std::to_string(mb_host.model / MiB),
|
|
||||||
std::to_string(mb_host.context / MiB),
|
|
||||||
std::to_string(mb_host.compute / MiB),
|
|
||||||
""}); // unaccounted
|
|
||||||
}
|
|
||||||
|
|
||||||
// print memory breakdown for all remaining buffer types:
|
|
||||||
for (const auto & buft_mb : memory_breakdown) {
|
|
||||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
|
||||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
|
||||||
if (seen_buffer_types.count(buft) == 1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const std::string name = ggml_backend_buft_name(buft);
|
|
||||||
const size_t self = mb.model + mb.context + mb.compute;
|
|
||||||
table_data.push_back({
|
|
||||||
template_other,
|
|
||||||
" - " + name,
|
|
||||||
"", // total
|
|
||||||
"", // free
|
|
||||||
std::to_string(self / MiB),
|
|
||||||
std::to_string(mb.model / MiB),
|
|
||||||
std::to_string(mb.context / MiB),
|
|
||||||
std::to_string(mb.compute / MiB),
|
|
||||||
""}); // unaccounted
|
|
||||||
seen_buffer_types.insert(buft);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
|
||||||
size_t max_len = 0;
|
|
||||||
for (const auto & td : table_data) {
|
|
||||||
max_len = std::max(max_len, td[j].length());
|
|
||||||
}
|
|
||||||
for (auto & td : table_data) {
|
|
||||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (const auto & td : table_data) {
|
|
||||||
LLAMA_LOG_INFO(td[0].c_str(),
|
|
||||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
|
||||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// training
|
// training
|
||||||
//
|
//
|
||||||
|
|
@ -3631,3 +3523,11 @@ void llama_opt_epoch(
|
||||||
callback_train,
|
callback_train,
|
||||||
callback_eval);
|
callback_eval);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// ext
|
||||||
|
//
|
||||||
|
|
||||||
|
llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
|
||||||
|
return ctx->memory_breakdown();
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "llama-ext.h"
|
||||||
#include "llama-cparams.h"
|
#include "llama-cparams.h"
|
||||||
#include "llama-graph.h"
|
#include "llama-graph.h"
|
||||||
#include "llama-adapter.h"
|
#include "llama-adapter.h"
|
||||||
|
|
@ -22,17 +23,6 @@ class llama_io_write_i;
|
||||||
struct llama_memory_i;
|
struct llama_memory_i;
|
||||||
struct llama_memory_context_i;
|
struct llama_memory_context_i;
|
||||||
|
|
||||||
// "memory" as in physical memory for a buffer type, in bytes
|
|
||||||
struct llama_memory_breakdown_data {
|
|
||||||
size_t model = 0; // memory allocated for the model
|
|
||||||
size_t context = 0; // memory allocated for the context
|
|
||||||
size_t compute = 0; // memory allocated for temporary compute buffers
|
|
||||||
|
|
||||||
size_t total() const {
|
|
||||||
return model + context + compute;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_context {
|
struct llama_context {
|
||||||
// init scheduler and compute buffers, reserve worst-case graphs
|
// init scheduler and compute buffers, reserve worst-case graphs
|
||||||
llama_context(
|
llama_context(
|
||||||
|
|
@ -172,7 +162,7 @@ struct llama_context {
|
||||||
llama_perf_context_data perf_get_data() const;
|
llama_perf_context_data perf_get_data() const;
|
||||||
void perf_reset();
|
void perf_reset();
|
||||||
|
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
|
llama_memory_breakdown memory_breakdown() const;
|
||||||
|
|
||||||
//
|
//
|
||||||
// training
|
// training
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,12 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama-context.h"
|
// this is a staging header for new llama.cpp API
|
||||||
#include "ggml.h"
|
// breaking changes and C++ are allowed. everything here should be considered WIP
|
||||||
#include "stdint.h"
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||||
|
|
@ -10,3 +14,77 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||||
uint32_t n_tokens,
|
uint32_t n_tokens,
|
||||||
uint32_t n_seqs,
|
uint32_t n_seqs,
|
||||||
uint32_t n_outputs);
|
uint32_t n_outputs);
|
||||||
|
|
||||||
|
// Get the default ggml_type for a given ftype.
|
||||||
|
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||||
|
|
||||||
|
struct quantize_state_impl;
|
||||||
|
|
||||||
|
LLAMA_API quantize_state_impl * llama_quant_init(
|
||||||
|
const llama_model * model,
|
||||||
|
const llama_model_quantize_params * params);
|
||||||
|
|
||||||
|
LLAMA_API void llama_quant_free(quantize_state_impl * qs);
|
||||||
|
|
||||||
|
// Descriptor for constructing a mock model for quantization testing.
|
||||||
|
struct llama_quant_model_desc {
|
||||||
|
const char * architecture;
|
||||||
|
uint32_t n_embd;
|
||||||
|
uint32_t n_ff;
|
||||||
|
uint32_t n_layer;
|
||||||
|
uint32_t n_head;
|
||||||
|
uint32_t n_head_kv;
|
||||||
|
uint32_t n_expert;
|
||||||
|
uint32_t n_embd_head_k;
|
||||||
|
uint32_t n_embd_head_v;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create a mock model from a metadata descriptor (for testing).
|
||||||
|
// The returned model must be freed with llama_model_free().
|
||||||
|
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
|
||||||
|
|
||||||
|
// Returns true if this tensor should be quantized (based on name, dims, params).
|
||||||
|
LLAMA_API bool llama_quant_tensor_allows_quantization(
|
||||||
|
const quantize_state_impl * qs,
|
||||||
|
const ggml_tensor * tensor);
|
||||||
|
|
||||||
|
// Compute quantization type assignments for a list of tensors.
|
||||||
|
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
|
||||||
|
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
|
||||||
|
LLAMA_API void llama_quant_compute_types(
|
||||||
|
quantize_state_impl * qs,
|
||||||
|
llama_ftype ftype,
|
||||||
|
ggml_tensor ** tensors,
|
||||||
|
ggml_type * result_types,
|
||||||
|
size_t n_tensors);
|
||||||
|
|
||||||
|
//
|
||||||
|
// device memory querying
|
||||||
|
//
|
||||||
|
|
||||||
|
// "memory" as in physical memory for a buffer type, in bytes
|
||||||
|
struct llama_memory_breakdown_data {
|
||||||
|
size_t model = 0; // memory allocated for the model
|
||||||
|
size_t context = 0; // memory allocated for the context
|
||||||
|
size_t compute = 0; // memory allocated for temporary compute buffers
|
||||||
|
|
||||||
|
size_t total() const {
|
||||||
|
return model + context + compute;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_device_memory_data {
|
||||||
|
int64_t total;
|
||||||
|
int64_t free;
|
||||||
|
llama_memory_breakdown_data mb;
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: convert to C-style data structure
|
||||||
|
using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>;
|
||||||
|
|
||||||
|
LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
|
||||||
|
LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
|
||||||
|
|
||||||
|
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
|
||||||
|
|
||||||
|
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <set>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
#define MAX_REPETITION_THRESHOLD 2000
|
#define MAX_REPETITION_THRESHOLD 2000
|
||||||
|
|
@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
bool is_nested) {
|
bool is_nested) {
|
||||||
size_t last_sym_start = rule.size();
|
size_t last_sym_start = rule.size();
|
||||||
const char * pos = src;
|
const char * pos = src;
|
||||||
|
uint64_t n_prev_rules = 1;
|
||||||
|
|
||||||
// use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
|
// use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
|
||||||
// (though it's technically the same as -1 now)
|
// (though it's technically the same as -1 now)
|
||||||
|
|
@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
// S' ::= S |
|
// S' ::= S |
|
||||||
|
|
||||||
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
|
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
|
||||||
|
// Calculate the total number of rules that will be generated by this repetition
|
||||||
|
uint64_t total_rules = 1; // Start with 1 for the original rule
|
||||||
|
if (!no_max && max_times > 0) {
|
||||||
|
total_rules = max_times;
|
||||||
|
} else if (min_times > 0) {
|
||||||
|
total_rules = min_times;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) {
|
||||||
|
throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity");
|
||||||
|
}
|
||||||
|
|
||||||
if (min_times == 0) {
|
if (min_times == 0) {
|
||||||
rule.resize(last_sym_start);
|
rule.resize(last_sym_start);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
if (n_opt > 0) {
|
if (n_opt > 0) {
|
||||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||||
}
|
}
|
||||||
|
n_prev_rules *= total_rules;
|
||||||
|
GGML_ASSERT(n_prev_rules >= 1);
|
||||||
};
|
};
|
||||||
|
|
||||||
while (*pos) {
|
while (*pos) {
|
||||||
if (*pos == '"') { // literal string
|
if (*pos == '"') { // literal string
|
||||||
pos++;
|
pos++;
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
|
n_prev_rules = 1;
|
||||||
while (*pos != '"') {
|
while (*pos != '"') {
|
||||||
if (!*pos) {
|
if (!*pos) {
|
||||||
throw std::runtime_error("unexpected end of input");
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
|
@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||||
}
|
}
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
|
n_prev_rules = 1;
|
||||||
while (*pos != ']') {
|
while (*pos != ']') {
|
||||||
if (!*pos) {
|
if (!*pos) {
|
||||||
throw std::runtime_error("unexpected end of input");
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
|
@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
auto token_pair = parse_token(vocab, pos);
|
auto token_pair = parse_token(vocab, pos);
|
||||||
const char * token_end = token_pair.second;
|
const char * token_end = token_pair.second;
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
|
n_prev_rules = 1;
|
||||||
rule.push_back({type, token_pair.first});
|
rule.push_back({type, token_pair.first});
|
||||||
pos = parse_space(token_end, is_nested);
|
pos = parse_space(token_end, is_nested);
|
||||||
} else if (is_word_char(*pos)) { // rule reference
|
} else if (is_word_char(*pos)) { // rule reference
|
||||||
|
|
@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||||
pos = parse_space(name_end, is_nested);
|
pos = parse_space(name_end, is_nested);
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
|
n_prev_rules = 1;
|
||||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||||
} else if (*pos == '(') { // grouping
|
} else if (*pos == '(') { // grouping
|
||||||
// parse nested alternates into synthesized rule
|
// parse nested alternates into synthesized rule
|
||||||
pos = parse_space(pos + 1, true);
|
pos = parse_space(pos + 1, true);
|
||||||
|
uint32_t n_rules_before = symbol_ids.size();
|
||||||
uint32_t sub_rule_id = generate_symbol_id(rule_name);
|
uint32_t sub_rule_id = generate_symbol_id(rule_name);
|
||||||
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
|
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
|
||||||
|
n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before);
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
// output reference to synthesized rule
|
// output reference to synthesized rule
|
||||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||||
|
|
@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
pos = parse_space(pos + 1, is_nested);
|
pos = parse_space(pos + 1, is_nested);
|
||||||
} else if (*pos == '.') { // any char
|
} else if (*pos == '.') { // any char
|
||||||
last_sym_start = rule.size();
|
last_sym_start = rule.size();
|
||||||
|
n_prev_rules = 1;
|
||||||
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||||
pos = parse_space(pos + 1, is_nested);
|
pos = parse_space(pos + 1, is_nested);
|
||||||
} else if (*pos == '*') {
|
} else if (*pos == '*') {
|
||||||
|
|
@ -830,32 +853,54 @@ static bool llama_grammar_match_token(
|
||||||
static void llama_grammar_advance_stack(
|
static void llama_grammar_advance_stack(
|
||||||
const llama_grammar_rules & rules,
|
const llama_grammar_rules & rules,
|
||||||
const llama_grammar_stack & stack,
|
const llama_grammar_stack & stack,
|
||||||
llama_grammar_stacks & new_stacks) {
|
llama_grammar_stacks & new_stacks) {
|
||||||
if (stack.empty()) {
|
std::vector<llama_grammar_stack> todo;
|
||||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
todo.push_back(stack);
|
||||||
new_stacks.emplace_back(stack);
|
|
||||||
|
auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) {
|
||||||
|
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
|
||||||
|
[](const llama_grammar_element * pa, const llama_grammar_element * pb) {
|
||||||
|
return pa < pb; // Compare pointer addresses
|
||||||
|
}
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
std::set<llama_grammar_stack, decltype(stack_cmp)> seen(stack_cmp);
|
||||||
|
|
||||||
|
while (!todo.empty()) {
|
||||||
|
llama_grammar_stack curr_stack = std::move(todo.back());
|
||||||
|
todo.pop_back();
|
||||||
|
|
||||||
|
if (seen.find( curr_stack) != seen.end()) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
return;
|
seen.insert(curr_stack);
|
||||||
}
|
|
||||||
|
|
||||||
const llama_grammar_element * pos = stack.back();
|
if (curr_stack.empty()) {
|
||||||
|
if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
|
||||||
|
new_stacks.emplace_back(std::move(curr_stack));
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
switch (pos->type) {
|
const llama_grammar_element * pos = curr_stack.back();
|
||||||
|
|
||||||
|
switch (pos->type) {
|
||||||
case LLAMA_GRETYPE_RULE_REF: {
|
case LLAMA_GRETYPE_RULE_REF: {
|
||||||
const size_t rule_id = static_cast<size_t>(pos->value);
|
const size_t rule_id = static_cast<size_t>(pos->value);
|
||||||
const llama_grammar_element * subpos = rules[rule_id].data();
|
const llama_grammar_element * subpos = rules[rule_id].data();
|
||||||
do {
|
do {
|
||||||
// init new stack without the top (pos)
|
// init new stack without the top (pos)
|
||||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1);
|
||||||
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||||
// if this rule ref is followed by another element, add that to stack
|
// if this rule ref is followed by another element, add that to stack
|
||||||
new_stack.push_back(pos + 1);
|
next_stack.push_back(pos + 1);
|
||||||
}
|
}
|
||||||
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
||||||
// if alternate is nonempty, add to stack
|
// if alternate is nonempty, add to stack
|
||||||
new_stack.push_back(subpos);
|
next_stack.push_back(subpos);
|
||||||
}
|
}
|
||||||
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
todo.push_back(std::move(next_stack));
|
||||||
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
||||||
// scan to end of alternate def
|
// scan to end of alternate def
|
||||||
subpos++;
|
subpos++;
|
||||||
|
|
@ -874,9 +919,9 @@ static void llama_grammar_advance_stack(
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
case LLAMA_GRETYPE_TOKEN:
|
case LLAMA_GRETYPE_TOKEN:
|
||||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
|
||||||
// only add the stack if it's not a duplicate of one we already have
|
// only add the stack if it's not a duplicate of one we already have
|
||||||
new_stacks.emplace_back(stack);
|
new_stacks.emplace_back(std::move(curr_stack));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
@ -884,6 +929,7 @@ static void llama_grammar_advance_stack(
|
||||||
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
||||||
// those
|
// those
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
#include "llama-graph.h"
|
#include "llama-graph.h"
|
||||||
|
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
|
#include "llama-model.h"
|
||||||
#include "llama-batch.h"
|
#include "llama-batch.h"
|
||||||
#include "llama-cparams.h"
|
#include "llama-cparams.h"
|
||||||
|
|
||||||
|
|
@ -19,7 +20,7 @@
|
||||||
|
|
||||||
// dedup helpers
|
// dedup helpers
|
||||||
|
|
||||||
static ggml_tensor * build_kq_mask(
|
static ggml_tensor * build_attn_inp_kq_mask(
|
||||||
ggml_context * ctx,
|
ggml_context * ctx,
|
||||||
const llama_kv_cache_context * mctx,
|
const llama_kv_cache_context * mctx,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
|
|
@ -28,7 +29,11 @@ static ggml_tensor * build_kq_mask(
|
||||||
const auto n_tokens = ubatch.n_tokens;
|
const auto n_tokens = ubatch.n_tokens;
|
||||||
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
||||||
|
|
||||||
return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||||
|
ggml_set_input(res);
|
||||||
|
ggml_set_name(res, "attn_inp_kq_mask");
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool can_reuse_kq_mask(
|
static bool can_reuse_kq_mask(
|
||||||
|
|
@ -52,6 +57,21 @@ static bool can_reuse_kq_mask(
|
||||||
|
|
||||||
// impl
|
// impl
|
||||||
|
|
||||||
|
static ggml_tensor * ggml_mul_mat_aux(
|
||||||
|
ggml_context * ctx,
|
||||||
|
ggml_tensor * cur,
|
||||||
|
ggml_tensor * rot) {
|
||||||
|
const auto n = rot->ne[0];
|
||||||
|
|
||||||
|
ggml_tensor * res;
|
||||||
|
|
||||||
|
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
|
||||||
|
res = ggml_mul_mat (ctx, rot, res);
|
||||||
|
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
|
||||||
if (ubatch->token) {
|
if (ubatch->token) {
|
||||||
const int64_t n_tokens = ubatch->n_tokens;
|
const int64_t n_tokens = ubatch->n_tokens;
|
||||||
|
|
@ -429,6 +449,14 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
|
||||||
mctx->set_input_v_idxs(self_v_idxs, ubatch);
|
mctx->set_input_v_idxs(self_v_idxs, ubatch);
|
||||||
|
|
||||||
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||||
|
|
||||||
|
if (self_k_rot) {
|
||||||
|
mctx->set_input_k_rot(self_k_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (self_v_rot) {
|
||||||
|
mctx->set_input_v_rot(self_v_rot);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
||||||
|
|
@ -476,6 +504,22 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||||
|
|
||||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||||
|
|
||||||
|
if (self_k_rot) {
|
||||||
|
mctx->get_base()->set_input_k_rot(self_k_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (self_v_rot) {
|
||||||
|
mctx->get_base()->set_input_v_rot(self_v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (self_k_rot_swa) {
|
||||||
|
mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (self_v_rot_swa) {
|
||||||
|
mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||||
|
|
@ -532,6 +576,14 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||||
|
|
||||||
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||||
|
|
||||||
|
if (inp_attn->self_k_rot) {
|
||||||
|
mctx->get_attn()->set_input_k_rot(inp_attn->self_k_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inp_attn->self_v_rot) {
|
||||||
|
mctx->get_attn()->set_input_v_rot(inp_attn->self_v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
if (inp_rs->s_copy) {
|
if (inp_rs->s_copy) {
|
||||||
|
|
@ -630,6 +682,22 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
|
||||||
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
|
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (inp_attn->self_k_rot) {
|
||||||
|
attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inp_attn->self_v_rot) {
|
||||||
|
attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inp_attn->self_k_rot_swa) {
|
||||||
|
attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inp_attn->self_v_rot_swa) {
|
||||||
|
attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa);
|
||||||
|
}
|
||||||
|
|
||||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
if (inp_rs->s_copy) {
|
if (inp_rs->s_copy) {
|
||||||
|
|
@ -992,6 +1060,84 @@ ggml_tensor * llm_graph_context::build_norm(
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
llm_graph_qkv llm_graph_context::build_qkv(
|
||||||
|
const llama_layer & layer,
|
||||||
|
ggml_tensor * cur,
|
||||||
|
int64_t n_embd_head,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t n_head_kv,
|
||||||
|
int il) const {
|
||||||
|
const int64_t n_embd_q = n_embd_head * n_head;
|
||||||
|
const int64_t n_embd_kv = n_embd_head * n_head_kv;
|
||||||
|
|
||||||
|
ggml_tensor * Qcur, * Kcur, * Vcur;
|
||||||
|
|
||||||
|
if (layer.wqkv) {
|
||||||
|
// fused QKV path
|
||||||
|
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
|
||||||
|
cb(qkv, "wqkv", il);
|
||||||
|
if (layer.wqkv_b) {
|
||||||
|
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
|
||||||
|
cb(qkv, "wqkv_b", il);
|
||||||
|
}
|
||||||
|
if (hparams.f_clamp_kqv > 0.0f) {
|
||||||
|
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||||
|
cb(qkv, "wqkv_clamped", il);
|
||||||
|
}
|
||||||
|
Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
|
||||||
|
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1], 0);
|
||||||
|
Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
|
||||||
|
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
|
||||||
|
ggml_row_size(qkv->type, n_embd_q));
|
||||||
|
Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
|
||||||
|
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
|
||||||
|
ggml_row_size(qkv->type, n_embd_q + n_embd_kv));
|
||||||
|
} else {
|
||||||
|
// separate Q/K/V path
|
||||||
|
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
if (layer.wq_b) {
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
if (hparams.f_clamp_kqv > 0.0f) {
|
||||||
|
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||||
|
cb(Qcur, "Qcur_clamped", il);
|
||||||
|
}
|
||||||
|
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
if (layer.wk_b) {
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
if (hparams.f_clamp_kqv > 0.0f) {
|
||||||
|
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||||
|
cb(Kcur, "Kcur_clamped", il);
|
||||||
|
}
|
||||||
|
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
if (layer.wv_b) {
|
||||||
|
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
}
|
||||||
|
if (hparams.f_clamp_kqv > 0.0f) {
|
||||||
|
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||||
|
cb(Vcur, "Vcur_clamped", il);
|
||||||
|
}
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
return { Qcur, Kcur, Vcur };
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_ffn(
|
ggml_tensor * llm_graph_context::build_ffn(
|
||||||
ggml_tensor * cur,
|
ggml_tensor * cur,
|
||||||
ggml_tensor * up,
|
ggml_tensor * up,
|
||||||
|
|
@ -1516,9 +1662,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
|
|
||||||
if (!weight_before_ffn) {
|
if (!weight_before_ffn) {
|
||||||
experts = ggml_mul(ctx0, experts, weights);
|
experts = ggml_mul(ctx0, experts, weights);
|
||||||
cb(cur, "ffn_moe_weighted", il);
|
cb(experts, "ffn_moe_weighted", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, experts);
|
||||||
|
|
||||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||||
|
|
||||||
assert(n_expert_used > 0);
|
assert(n_expert_used > 0);
|
||||||
|
|
@ -1538,6 +1686,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
|
|
||||||
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
||||||
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, moe_out);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hparams.n_expert_used == 1) {
|
if (hparams.n_expert_used == 1) {
|
||||||
|
|
@ -1665,7 +1815,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
|
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
|
||||||
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
|
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
|
||||||
// but this would make the graph topology depend on the number of output tokens, which can interere with
|
// but this would make the graph topology depend on the number of output tokens, which can interfere with
|
||||||
// features that require constant topology such as pipeline parallelism
|
// features that require constant topology such as pipeline parallelism
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
|
||||||
//if (n_outputs < n_tokens) {
|
//if (n_outputs < n_tokens) {
|
||||||
|
|
@ -1940,6 +2090,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
llm_graph_input_attn_no_cache * inp,
|
llm_graph_input_attn_no_cache * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur,
|
ggml_tensor * q_cur,
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
|
|
@ -1973,7 +2124,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
cur = build_lora_mm(wo, cur);
|
cur = build_lora_mm(wo, cur, wo_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wo_b) {
|
if (wo_b) {
|
||||||
|
|
@ -2002,13 +2153,13 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
||||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||||
|
|
||||||
ggml_set_input(inp->self_kq_mask);
|
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0);
|
||||||
|
inp->self_v_rot = mctx_cur->build_input_v_rot(ctx0);
|
||||||
|
|
||||||
return inp;
|
return inp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2024,6 +2175,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
llm_graph_input_attn_kv * inp,
|
llm_graph_input_attn_kv * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur,
|
ggml_tensor * q_cur,
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
|
|
@ -2034,6 +2186,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
int il) const {
|
int il) const {
|
||||||
GGML_ASSERT(v_mla == nullptr);
|
GGML_ASSERT(v_mla == nullptr);
|
||||||
|
|
||||||
|
if (inp->self_k_rot) {
|
||||||
|
q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot);
|
||||||
|
k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inp->self_v_rot) {
|
||||||
|
v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
// these nodes are added to the graph together so that they are not reordered
|
// these nodes are added to the graph together so that they are not reordered
|
||||||
// by doing so, the number of splits in the graph is reduced
|
// by doing so, the number of splits in the graph is reduced
|
||||||
// expand k later to enable rope fusion which directly writes into k-v cache
|
// expand k later to enable rope fusion which directly writes into k-v cache
|
||||||
|
|
@ -2061,11 +2222,20 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
|
if (inp->self_v_rot) {
|
||||||
|
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
cur = build_lora_mm(wo, cur);
|
|
||||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||||
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
||||||
|
cur = build_lora_mm(wo, cur);
|
||||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||||
|
if (wo_s) {
|
||||||
|
cur = ggml_mul(ctx0, cur, wo_s);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cur = build_lora_mm(wo, cur, wo_s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2090,9 +2260,7 @@ static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
|
||||||
|
|
||||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||||
ggml_set_input(inp->self_kq_mask);
|
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2111,6 +2279,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
llm_graph_input_attn_k * inp,
|
llm_graph_input_attn_k * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur,
|
ggml_tensor * q_cur,
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
|
|
@ -2145,10 +2314,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
cur = build_lora_mm(wo, cur);
|
|
||||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||||
|
cur = build_lora_mm(wo, cur);
|
||||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||||
|
if (wo_s) {
|
||||||
|
cur = ggml_mul(ctx0, cur, wo_s);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cur = build_lora_mm(wo, cur, wo_s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2163,6 +2337,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
llm_graph_input_attn_kv_iswa * inp,
|
llm_graph_input_attn_kv_iswa * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur,
|
ggml_tensor * q_cur,
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
|
|
@ -2171,6 +2346,23 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * v_mla,
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
const bool is_swa = hparams.is_swa(il);
|
||||||
|
|
||||||
|
auto * k_rot = is_swa ? inp->self_k_rot_swa : inp->self_k_rot;
|
||||||
|
auto * v_rot = is_swa ? inp->self_v_rot_swa : inp->self_v_rot;
|
||||||
|
|
||||||
|
if (k_rot) {
|
||||||
|
q_cur = ggml_mul_mat_aux(ctx0, q_cur, k_rot);
|
||||||
|
if (k_cur) {
|
||||||
|
k_cur = ggml_mul_mat_aux(ctx0, k_cur, k_rot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (v_rot) {
|
||||||
|
if (v_cur) {
|
||||||
|
v_cur = ggml_mul_mat_aux(ctx0, v_cur, v_rot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// these nodes are added to the graph together so that they are not reordered
|
// these nodes are added to the graph together so that they are not reordered
|
||||||
// by doing so, the number of splits in the graph is reduced
|
// by doing so, the number of splits in the graph is reduced
|
||||||
ggml_build_forward_expand(gf, q_cur);
|
ggml_build_forward_expand(gf, q_cur);
|
||||||
|
|
@ -2185,8 +2377,6 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
|
|
||||||
const auto * mctx_iswa = inp->mctx;
|
const auto * mctx_iswa = inp->mctx;
|
||||||
|
|
||||||
const bool is_swa = hparams.is_swa(il);
|
|
||||||
|
|
||||||
const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base();
|
const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base();
|
||||||
|
|
||||||
// optionally store to KV cache
|
// optionally store to KV cache
|
||||||
|
|
@ -2211,8 +2401,12 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
|
if (v_rot) {
|
||||||
|
cur = ggml_mul_mat_aux(ctx0, cur, v_rot);
|
||||||
|
}
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
cur = build_lora_mm(wo, cur);
|
cur = build_lora_mm(wo, cur, wo_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wo_b) {
|
if (wo_b) {
|
||||||
|
|
@ -2243,6 +2437,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
llm_graph_input_attn_cross * inp,
|
llm_graph_input_attn_cross * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur,
|
ggml_tensor * q_cur,
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
|
|
@ -2267,7 +2462,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
cur = build_lora_mm(wo, cur);
|
cur = build_lora_mm(wo, cur, wo_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wo_b) {
|
if (wo_b) {
|
||||||
|
|
@ -2293,12 +2488,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
|
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
|
||||||
ggml_set_input(inp->self_kq_mask);
|
|
||||||
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
|
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|
@ -2307,14 +2498,16 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask_swa = build_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
|
inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
|
||||||
ggml_set_input(inp->self_kq_mask_swa);
|
|
||||||
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
|
|
||||||
|
|
||||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||||
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0);
|
||||||
|
inp->self_v_rot = mctx_cur->get_base()->build_input_v_rot(ctx0);
|
||||||
|
|
||||||
|
inp->self_k_rot_swa = mctx_cur->get_swa()->build_input_k_rot(ctx0);
|
||||||
|
inp->self_v_rot_swa = mctx_cur->get_swa()->build_input_v_rot(ctx0);
|
||||||
|
|
||||||
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
|
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2348,7 +2541,7 @@ ggml_tensor * llm_graph_context::build_rs(
|
||||||
ggml_build_forward_expand(gf,
|
ggml_build_forward_expand(gf,
|
||||||
ggml_cpy(ctx0,
|
ggml_cpy(ctx0,
|
||||||
states_extra,
|
states_extra,
|
||||||
ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
|
ggml_view_2d(ctx0, s, state_size, (n_rs - n_seqs), s->nb[1], (rs_head + n_seqs)*s->nb[1])));
|
||||||
|
|
||||||
return output_states;
|
return output_states;
|
||||||
}
|
}
|
||||||
|
|
@ -2473,9 +2666,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
|
||||||
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
|
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
|
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp_attn->self_kq_mask = build_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
|
inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
|
||||||
ggml_set_input(inp_attn->self_kq_mask);
|
|
||||||
|
|
||||||
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
|
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2483,9 +2674,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
|
||||||
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp_attn->self_kq_mask_swa = build_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
|
inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
|
||||||
ggml_set_input(inp_attn->self_kq_mask_swa);
|
|
||||||
|
|
||||||
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
|
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ struct ggml_context;
|
||||||
struct ggml_tensor;
|
struct ggml_tensor;
|
||||||
|
|
||||||
struct llama_cparams;
|
struct llama_cparams;
|
||||||
|
struct llama_layer;
|
||||||
|
|
||||||
struct llama_memory_context_i;
|
struct llama_memory_context_i;
|
||||||
|
|
||||||
|
|
@ -308,6 +309,10 @@ public:
|
||||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||||
|
|
||||||
|
// note: assumes v_rot^2 == I
|
||||||
|
ggml_tensor * self_k_rot = nullptr;
|
||||||
|
ggml_tensor * self_v_rot = nullptr;
|
||||||
|
|
||||||
// note: these have to be copies because in order to be able to reuse a graph, its inputs
|
// note: these have to be copies because in order to be able to reuse a graph, its inputs
|
||||||
// need to carry these parameters with them. otherwise, they can point to freed
|
// need to carry these parameters with them. otherwise, they can point to freed
|
||||||
// llm_graph_params from a previous batch, causing stack-use-after-return
|
// llm_graph_params from a previous batch, causing stack-use-after-return
|
||||||
|
|
@ -384,6 +389,12 @@ public:
|
||||||
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||||
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||||
|
|
||||||
|
ggml_tensor * self_k_rot = nullptr;
|
||||||
|
ggml_tensor * self_v_rot = nullptr;
|
||||||
|
|
||||||
|
ggml_tensor * self_k_rot_swa = nullptr;
|
||||||
|
ggml_tensor * self_v_rot_swa = nullptr;
|
||||||
|
|
||||||
const llama_hparams hparams;
|
const llama_hparams hparams;
|
||||||
const llama_cparams cparams;
|
const llama_cparams cparams;
|
||||||
|
|
||||||
|
|
@ -697,6 +708,12 @@ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
|
||||||
// used in build_rs to properly order writes and avoid unnecessary copies
|
// used in build_rs to properly order writes and avoid unnecessary copies
|
||||||
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
||||||
|
|
||||||
|
// bundle of the Q, K and V tensors; this is the return type of llm_graph_context::build_qkv
// the expected shape of each member is noted next to it
struct llm_graph_qkv {
|
||||||
|
ggml_tensor * q; // [n_embd_head, n_head, n_tokens]
|
||||||
|
ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens]
|
||||||
|
ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens]
|
||||||
|
};
|
||||||
|
|
||||||
struct llm_graph_context {
|
struct llm_graph_context {
|
||||||
const llm_arch arch;
|
const llm_arch arch;
|
||||||
|
|
||||||
|
|
@ -783,6 +800,17 @@ struct llm_graph_context {
|
||||||
llm_norm_type type,
|
llm_norm_type type,
|
||||||
int il) const;
|
int il) const;
|
||||||
|
|
||||||
|
|
||||||
|
// compute Q, K, V projections with optional bias and reshape
|
||||||
|
// supports both fused wqkv and separate wq/wk/wv paths
|
||||||
|
llm_graph_qkv build_qkv(
|
||||||
|
const llama_layer & layer,
|
||||||
|
ggml_tensor * cur,
|
||||||
|
int64_t n_embd_head,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t n_head_kv,
|
||||||
|
int il) const;
|
||||||
|
|
||||||
ggml_tensor * build_ffn(
|
ggml_tensor * build_ffn(
|
||||||
ggml_tensor * cur,
|
ggml_tensor * cur,
|
||||||
ggml_tensor * up,
|
ggml_tensor * up,
|
||||||
|
|
@ -882,6 +910,7 @@ struct llm_graph_context {
|
||||||
llm_graph_input_attn_no_cache * inp,
|
llm_graph_input_attn_no_cache * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
|
|
@ -897,6 +926,7 @@ struct llm_graph_context {
|
||||||
llm_graph_input_attn_kv * inp,
|
llm_graph_input_attn_kv * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
|
|
@ -912,6 +942,7 @@ struct llm_graph_context {
|
||||||
llm_graph_input_attn_k * inp,
|
llm_graph_input_attn_k * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
|
|
@ -928,6 +959,7 @@ struct llm_graph_context {
|
||||||
llm_graph_input_attn_kv_iswa * inp,
|
llm_graph_input_attn_kv_iswa * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
||||||
|
|
@ -943,6 +975,7 @@ struct llm_graph_context {
|
||||||
llm_graph_input_attn_cross * inp,
|
llm_graph_input_attn_cross * inp,
|
||||||
ggml_tensor * wo,
|
ggml_tensor * wo,
|
||||||
ggml_tensor * wo_b,
|
ggml_tensor * wo_b,
|
||||||
|
ggml_tensor * wo_s,
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
|
|
|
||||||
|
|
@ -116,6 +116,7 @@ struct llama_hparams {
|
||||||
float rope_freq_base_train_swa = 10000.0f;
|
float rope_freq_base_train_swa = 10000.0f;
|
||||||
float rope_freq_scale_train;
|
float rope_freq_scale_train;
|
||||||
float rope_freq_scale_train_swa = 1.0f;
|
float rope_freq_scale_train_swa = 1.0f;
|
||||||
|
float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE
|
||||||
|
|
||||||
uint32_t n_ctx_orig_yarn;
|
uint32_t n_ctx_orig_yarn;
|
||||||
float rope_yarn_log_mul = 0.0f;
|
float rope_yarn_log_mul = 0.0f;
|
||||||
|
|
@ -209,6 +210,9 @@ struct llama_hparams {
|
||||||
// qwen3vl deepstack
|
// qwen3vl deepstack
|
||||||
uint32_t n_deepstack_layers = 0;
|
uint32_t n_deepstack_layers = 0;
|
||||||
|
|
||||||
|
// gemma4 per-layer embedding
|
||||||
|
uint32_t n_embd_per_layer = 0;
|
||||||
|
|
||||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
||||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
|
||||||
|
|
@ -128,7 +128,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
||||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
case GGUF_TYPE_BOOL: return ((const int8_t *)data)[i] != 0 ? "true" : "false";
|
||||||
default: return format("unknown type %d", type);
|
default: return format("unknown type %d", type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,65 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
|
// returns true if n is a positive power of two (1, 2, 4, ...)
// note: zero and negative values are rejected explicitly - the bit trick alone would
//       report 0 as a power of two, and n - 1 would overflow for INT_MIN
static bool ggml_is_power_of_2(int n) {
    return n > 0 && (n & (n - 1)) == 0;
}
|
||||||
|
|
||||||
|
// orthonormal Walsh-Hadamard rotation matrix
|
||||||
|
// note: res^2 == I
|
||||||
|
static void ggml_gen_hadamard(ggml_tensor * tensor) {
|
||||||
|
assert(tensor->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
const int n = tensor->ne[0];
|
||||||
|
|
||||||
|
assert(ggml_is_power_of_2(n));
|
||||||
|
assert(tensor->ne[1] == n);
|
||||||
|
assert(tensor->ne[2] == 1);
|
||||||
|
assert(tensor->ne[3] == 1);
|
||||||
|
|
||||||
|
std::vector<float> data_f32;
|
||||||
|
|
||||||
|
float * data = (float *) tensor->data;
|
||||||
|
|
||||||
|
if (tensor->type != GGML_TYPE_F32) {
|
||||||
|
data_f32.resize(n*n);
|
||||||
|
data = data_f32.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
data[0*n + 0] = 1.0 / sqrtf(n);
|
||||||
|
|
||||||
|
for (int s = 1; s < n; s *= 2) {
|
||||||
|
for (int i = 0; i < s; i++) {
|
||||||
|
for (int j = 0; j < s; j++) {
|
||||||
|
const float val = data[i*n + j];
|
||||||
|
|
||||||
|
data[(i + s)*n + (j )] = val;
|
||||||
|
data[(i )*n + (j + s)] = val;
|
||||||
|
data[(i + s)*n + (j + s)] = -val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensor->type != GGML_TYPE_F32) {
|
||||||
|
ggml_quantize_chunk(tensor->type, data, tensor->data, 0, 1, n*n, nullptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// multiply `cur` by the matrix `rot` and return a tensor with the same shape as `cur`
//
// `cur` is first reshaped to 2D with rows of rot->ne[0] elements, multiplied with `rot`,
// and the product is reshaped back to the original 4 dimensions of `cur`
// note: `rot` is dereferenced unconditionally, so it must not be null
static ggml_tensor * ggml_mul_mat_aux(
        ggml_context * ctx,
        ggml_tensor * cur,
        ggml_tensor * rot) {
    const auto n_rot  = rot->ne[0];
    const auto n_rows = ggml_nelements(cur)/n_rot;

    ggml_tensor * flat    = ggml_reshape_2d(ctx, cur, n_rot, n_rows);
    ggml_tensor * rotated = ggml_mul_mat   (ctx, rot, flat);

    return ggml_reshape_4d(ctx, rotated, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// llama_kv_cache
|
// llama_kv_cache
|
||||||
//
|
//
|
||||||
|
|
@ -110,6 +169,18 @@ llama_kv_cache::llama_kv_cache(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (n_embd_head_k_all == 0) {
|
||||||
|
n_embd_head_k_all = (int32_t) hparams.n_embd_head_k(il);
|
||||||
|
} else if (n_embd_head_k_all > 0 && n_embd_head_k_all != (int32_t) hparams.n_embd_head_k(il)) {
|
||||||
|
n_embd_head_k_all = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_embd_head_v_all == 0) {
|
||||||
|
n_embd_head_v_all = (int32_t) hparams.n_embd_head_v(il);
|
||||||
|
} else if (n_embd_head_v_all > 0 && n_embd_head_v_all != (int32_t) hparams.n_embd_head_v(il)) {
|
||||||
|
n_embd_head_v_all = -1;
|
||||||
|
}
|
||||||
|
|
||||||
// [TAG_V_CACHE_VARIABLE]
|
// [TAG_V_CACHE_VARIABLE]
|
||||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||||
const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
|
const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
|
||||||
|
|
@ -209,6 +280,48 @@ llama_kv_cache::llama_kv_cache(
|
||||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE");
|
||||||
|
const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false;
|
||||||
|
if (attn_rot_disable) {
|
||||||
|
LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
|
attn_rot_k =
|
||||||
|
!attn_rot_disable &&
|
||||||
|
n_embd_head_k_all > 0 &&
|
||||||
|
ggml_is_quantized(type_k) &&
|
||||||
|
hparams.n_embd_head_k() % 64 == 0;
|
||||||
|
|
||||||
|
attn_rot_v =
|
||||||
|
!attn_rot_disable &&
|
||||||
|
n_embd_head_v_all > 0 &&
|
||||||
|
ggml_is_quantized(type_v) &&
|
||||||
|
hparams.n_embd_head_v() % 64 == 0;
|
||||||
|
|
||||||
|
LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all);
|
||||||
|
LLAMA_LOG_INFO("%s: attn_rot_v = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_v, n_embd_head_v_all);
|
||||||
|
|
||||||
|
// pre-compute the Hadamard matrices and keep them in host memory
|
||||||
|
// TODO: in the future, we can make copies in the backend buffers to avoid host -> device transfers
|
||||||
|
if (attn_rot_k || attn_rot_v) {
|
||||||
|
for (int64_t n = 64; n <= std::max(n_embd_head_k_all, n_embd_head_v_all); n *= 2) {
|
||||||
|
attn_rot_hadamard[n] = std::vector<float>(n*n);
|
||||||
|
|
||||||
|
ggml_init_params params = {
|
||||||
|
/* .mem_size = */ 1*ggml_tensor_overhead(),
|
||||||
|
/* .mem_buffer = */ nullptr,
|
||||||
|
/* .no_alloc = */ true,
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_context_ptr ctx { ggml_init(params) };
|
||||||
|
|
||||||
|
ggml_tensor * tmp = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, n, n);
|
||||||
|
tmp->data = attn_rot_hadamard[n].data();
|
||||||
|
|
||||||
|
ggml_gen_hadamard(tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
|
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
|
||||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||||
}
|
}
|
||||||
|
|
@ -1004,6 +1117,14 @@ bool llama_kv_cache::get_has_shift() const {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// type of the `k` tensor of the first entry in `layers`
// NOTE(review): presumably all layers share the same K type and `layers` is non-empty - confirm
ggml_type llama_kv_cache::type_k() const {
    const auto & first = layers[0];

    return first.k->type;
}
|
||||||
|
|
||||||
|
// type of the `v` tensor of the first entry in `layers`
// NOTE(review): presumably all layers share the same V type and `layers` is non-empty - confirm
ggml_type llama_kv_cache::type_v() const {
    const auto & first = layers[0];

    return first.v->type;
}
|
||||||
|
|
||||||
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
|
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
|
||||||
uint32_t result = 0;
|
uint32_t result = 0;
|
||||||
|
|
||||||
|
|
@ -1189,6 +1310,47 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
|
||||||
return v_idxs;
|
return v_idxs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// create the graph input that will hold the rotation matrix for K
//
// returns nullptr when the K rotation is not enabled (attn_rot_k == false)
// otherwise returns an F32 [nrot, nrot] input tensor named "attn_inp_k_rot", where nrot
// starts at 64 and is doubled for as long as the doubled size still divides n_embd_head_k_all
ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const {
    if (!attn_rot_k) {
        return nullptr;
    }

    // TODO: investigate if using the smallest rotation matrix is also beneficial for K (as is done for V)
    // ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
    int nrot = 64;
    while (n_embd_head_k_all % (2*nrot) == 0) {
        nrot *= 2;
    }

    ggml_tensor * rot = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
    ggml_set_input(rot);
    ggml_set_name(rot, "attn_inp_k_rot");

    return rot;
}
|
||||||
|
|
||||||
|
// create the graph input that will hold the rotation matrix for V
//
// returns nullptr when the V rotation is not enabled (attn_rot_v == false)
// otherwise returns an F32 [64, 64] input tensor named "attn_inp_v_rot"
ggml_tensor * llama_kv_cache::build_input_v_rot(ggml_context * ctx) const {
    if (!attn_rot_v) {
        return nullptr;
    }

    // using smaller rotation matrices for V seems beneficial, so the size is fixed at 64
    // instead of being doubled while it still divides hparams.n_embd_head_v()
    // ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4146397570
    const int nrot = 64;

    ggml_tensor * rot = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
    ggml_set_input(rot);
    ggml_set_name(rot, "attn_inp_v_rot");

    return rot;
}
|
||||||
|
|
||||||
void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
|
void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
|
||||||
const uint32_t n_tokens = ubatch->n_tokens;
|
const uint32_t n_tokens = ubatch->n_tokens;
|
||||||
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
|
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
|
||||||
|
|
@ -1507,6 +1669,24 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// copy the pre-computed matrix stored in attn_rot_hadamard under key dst->ne[0] into `dst`
//
// `dst` must live in a host buffer and a matrix for its size must exist
// note: ggml_nbytes(dst) bytes are copied
// NOTE(review): assumes dst is an F32 [n, n] tensor matching the stored n*n floats - confirm
void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const {
    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
    GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));

    const auto & hadamard = attn_rot_hadamard.at(dst->ne[0]);

    memcpy(dst->data, hadamard.data(), ggml_nbytes(dst));
}
|
||||||
|
|
||||||
|
// copy the pre-computed matrix stored in attn_rot_hadamard under key dst->ne[0] into `dst`
//
// `dst` must live in a host buffer and a matrix for its size must exist
// note: ggml_nbytes(dst) bytes are copied
// NOTE(review): assumes dst is an F32 [n, n] tensor matching the stored n*n floats - confirm
void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const {
    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
    GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));

    const auto & hadamard = attn_rot_hadamard.at(dst->ne[0]);

    memcpy(dst->data, hadamard.data(), ggml_nbytes(dst));
}
|
||||||
|
|
||||||
size_t llama_kv_cache::total_size() const {
|
size_t llama_kv_cache::total_size() const {
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
|
|
||||||
|
|
@ -1542,6 +1722,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||||
ggml_context * ctx,
|
ggml_context * ctx,
|
||||||
ggml_tensor * cur,
|
ggml_tensor * cur,
|
||||||
ggml_tensor * shift,
|
ggml_tensor * shift,
|
||||||
|
ggml_tensor * rot,
|
||||||
ggml_tensor * factors,
|
ggml_tensor * factors,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
float freq_scale,
|
float freq_scale,
|
||||||
|
|
@ -1561,17 +1742,22 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
|
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
|
||||||
? LLAMA_ROPE_TYPE_NEOX
|
? LLAMA_ROPE_TYPE_NEOX
|
||||||
: hparams.rope_type;
|
: hparams.rope_type;
|
||||||
|
|
||||||
ggml_tensor * tmp;
|
ggml_tensor * tmp;
|
||||||
|
|
||||||
if (ggml_is_quantized(cur->type)) {
|
if (ggml_is_quantized(cur->type)) {
|
||||||
// dequantize to f32 -> RoPE -> quantize back
|
// dequantize to f32 -> RoPE -> quantize back
|
||||||
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||||
|
|
||||||
|
// rotate back
|
||||||
|
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
|
||||||
|
|
||||||
tmp = ggml_rope_ext(ctx, tmp,
|
tmp = ggml_rope_ext(ctx, tmp,
|
||||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||||
|
|
||||||
|
// rotate fwd
|
||||||
|
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
|
||||||
|
|
||||||
tmp = ggml_cpy(ctx, tmp, cur);
|
tmp = ggml_cpy(ctx, tmp, cur);
|
||||||
} else {
|
} else {
|
||||||
// we rotate only the first n_rot dimensions
|
// we rotate only the first n_rot dimensions
|
||||||
|
|
@ -1592,6 +1778,9 @@ public:
|
||||||
|
|
||||||
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
|
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
|
||||||
|
|
||||||
|
// note: assumes k_rot^2 == I
|
||||||
|
ggml_tensor * k_rot = nullptr;
|
||||||
|
|
||||||
const llama_kv_cache * kv_self;
|
const llama_kv_cache * kv_self;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -1601,6 +1790,10 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
||||||
if (k_shift) {
|
if (k_shift) {
|
||||||
kv_self->set_input_k_shift(k_shift);
|
kv_self->set_input_k_shift(k_shift);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (k_rot) {
|
||||||
|
kv_self->set_input_k_rot(k_rot);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
|
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
|
||||||
|
|
@ -1612,6 +1805,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
|
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
|
||||||
ggml_set_input(inp->k_shift);
|
ggml_set_input(inp->k_shift);
|
||||||
|
|
||||||
|
inp->k_rot = build_input_k_rot(ctx);
|
||||||
|
|
||||||
const auto & cparams = lctx->get_cparams();
|
const auto & cparams = lctx->get_cparams();
|
||||||
|
|
||||||
for (const auto & layer : layers) {
|
for (const auto & layer : layers) {
|
||||||
|
|
@ -1636,7 +1831,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||||
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
||||||
ggml_row_size(layer.k->type, n_embd_nope));
|
ggml_row_size(layer.k->type, n_embd_nope));
|
||||||
|
|
||||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
|
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->k_rot, rope_factors, freq_base_l, freq_scale_l, il);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
}
|
}
|
||||||
|
|
@ -2240,6 +2435,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
|
||||||
return n_kv;
|
return n_kv;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_type llama_kv_cache_context::type_k() const {
|
||||||
|
return kv->type_k();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_type llama_kv_cache_context::type_v() const {
|
||||||
|
return kv->type_v();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
|
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
|
||||||
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
|
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
|
||||||
}
|
}
|
||||||
|
|
@ -2264,6 +2467,14 @@ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, con
|
||||||
return kv->build_input_v_idxs(ctx, ubatch);
|
return kv->build_input_v_idxs(ctx, ubatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_tensor * llama_kv_cache_context::build_input_k_rot(ggml_context * ctx) const {
|
||||||
|
return kv->build_input_k_rot(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * llama_kv_cache_context::build_input_v_rot(ggml_context * ctx) const {
|
||||||
|
return kv->build_input_v_rot(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
|
void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
|
||||||
kv->set_input_k_shift(dst);
|
kv->set_input_k_shift(dst);
|
||||||
}
|
}
|
||||||
|
|
@ -2283,3 +2494,11 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||||
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
||||||
kv->set_input_pos_bucket(dst, ubatch);
|
kv->set_input_pos_bucket(dst, ubatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_kv_cache_context::set_input_k_rot(ggml_tensor * dst) const {
|
||||||
|
kv->set_input_k_rot(dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_kv_cache_context::set_input_v_rot(ggml_tensor * dst) const {
|
||||||
|
kv->set_input_v_rot(dst);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -152,6 +152,9 @@ public:
|
||||||
|
|
||||||
bool get_has_shift() const;
|
bool get_has_shift() const;
|
||||||
|
|
||||||
|
ggml_type type_k() const;
|
||||||
|
ggml_type type_v() const;
|
||||||
|
|
||||||
//
|
//
|
||||||
// graph_build API
|
// graph_build API
|
||||||
//
|
//
|
||||||
|
|
@ -191,6 +194,9 @@ public:
|
||||||
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||||
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||||
|
|
||||||
|
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
|
||||||
|
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
|
||||||
|
|
||||||
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||||
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||||
|
|
||||||
|
|
@ -199,6 +205,9 @@ public:
|
||||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||||
|
|
||||||
|
void set_input_k_rot(ggml_tensor * dst) const;
|
||||||
|
void set_input_v_rot(ggml_tensor * dst) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const llama_model & model;
|
const llama_model & model;
|
||||||
const llama_hparams & hparams;
|
const llama_hparams & hparams;
|
||||||
|
|
@ -226,6 +235,18 @@ private:
|
||||||
// SWA
|
// SWA
|
||||||
const uint32_t n_swa = 0;
|
const uint32_t n_swa = 0;
|
||||||
|
|
||||||
|
// env: LLAMA_ATTN_ROT_DISABLE
|
||||||
|
bool attn_rot_k = false;
|
||||||
|
bool attn_rot_v = false;
|
||||||
|
|
||||||
|
// if all layers participating in the cache have constant head size, the value is stored here
|
||||||
|
// otherwise the value is -1
|
||||||
|
int32_t n_embd_head_k_all = 0;
|
||||||
|
int32_t n_embd_head_v_all = 0;
|
||||||
|
|
||||||
|
// pre-computed hadamard matrices
|
||||||
|
std::unordered_map<int64_t, std::vector<float>> attn_rot_hadamard;
|
||||||
|
|
||||||
// env: LLAMA_KV_CACHE_DEBUG
|
// env: LLAMA_KV_CACHE_DEBUG
|
||||||
int debug = 0;
|
int debug = 0;
|
||||||
|
|
||||||
|
|
@ -262,6 +283,7 @@ private:
|
||||||
ggml_context * ctx,
|
ggml_context * ctx,
|
||||||
ggml_tensor * cur,
|
ggml_tensor * cur,
|
||||||
ggml_tensor * shift,
|
ggml_tensor * shift,
|
||||||
|
ggml_tensor * rot,
|
||||||
ggml_tensor * factors,
|
ggml_tensor * factors,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
float freq_scale,
|
float freq_scale,
|
||||||
|
|
@ -328,12 +350,15 @@ public:
|
||||||
|
|
||||||
uint32_t get_n_kv() const;
|
uint32_t get_n_kv() const;
|
||||||
|
|
||||||
|
ggml_type type_k() const;
|
||||||
|
ggml_type type_v() const;
|
||||||
|
|
||||||
// get views of the current state of the cache
|
// get views of the current state of the cache
|
||||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||||
|
|
||||||
// store k_cur and v_cur in the cache based on the provided head location
|
// store k_cur and v_cur in the cache based on the provided head location
|
||||||
// note: the heads in k_cur and v_cur should be layed out contiguously in memory
|
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
||||||
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
||||||
// - k_idxs [n_tokens]
|
// - k_idxs [n_tokens]
|
||||||
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
|
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
|
||||||
|
|
@ -347,6 +372,9 @@ public:
|
||||||
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||||
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||||
|
|
||||||
|
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
|
||||||
|
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
|
||||||
|
|
||||||
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||||
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||||
|
|
||||||
|
|
@ -354,6 +382,9 @@ public:
|
||||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||||
|
|
||||||
|
void set_input_k_rot(ggml_tensor * dst) const;
|
||||||
|
void set_input_v_rot(ggml_tensor * dst) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
llama_memory_status status;
|
llama_memory_status status;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
|
||||||
// if all tokens are output, split by sequence
|
// if all tokens are output, split by sequence
|
||||||
ubatch = balloc.split_seq(n_ubatch);
|
ubatch = balloc.split_seq(n_ubatch);
|
||||||
} else {
|
} else {
|
||||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||||
// for simplicity, we always use sequential equal split for now
|
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||||
ubatch = balloc.split_equal(n_ubatch, true);
|
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ubatch.n_tokens == 0) {
|
if (ubatch.n_tokens == 0) {
|
||||||
|
|
|
||||||
|
|
@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
||||||
// if all tokens are output, split by sequence
|
// if all tokens are output, split by sequence
|
||||||
ubatch = balloc.split_seq(n_ubatch);
|
ubatch = balloc.split_seq(n_ubatch);
|
||||||
} else {
|
} else {
|
||||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||||
// for simplicity, we always use sequential equal split for now
|
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||||
ubatch = balloc.split_equal(n_ubatch, true);
|
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ubatch.n_tokens == 0) {
|
if (ubatch.n_tokens == 0) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#include "llama-memory-recurrent.h"
|
#include "llama-memory-recurrent.h"
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
#include "llama-io.h"
|
#include "llama-io.h"
|
||||||
#include "llama-batch.h"
|
#include "llama-batch.h"
|
||||||
|
|
@ -91,8 +92,8 @@ llama_memory_recurrent::llama_memory_recurrent(
|
||||||
throw std::runtime_error("failed to create ggml context for rs cache");
|
throw std::runtime_error("failed to create ggml context for rs cache");
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
|
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), mem_size);
|
||||||
ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
|
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), mem_size);
|
||||||
ggml_format_name(r, "cache_r_l%d", i);
|
ggml_format_name(r, "cache_r_l%d", i);
|
||||||
ggml_format_name(s, "cache_s_l%d", i);
|
ggml_format_name(s, "cache_s_l%d", i);
|
||||||
r_l[i] = r;
|
r_l[i] = r;
|
||||||
|
|
@ -928,11 +929,8 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||||
llama_seq_id seq_id;
|
llama_seq_id seq_id;
|
||||||
io.read_to(&seq_id, sizeof(seq_id));
|
io.read_to(&seq_id, sizeof(seq_id));
|
||||||
|
|
||||||
// TODO: llama_memory_recurrent should have a notion of max sequences
|
if (seq_id < 0 || (uint32_t) seq_id >= this->n_seq_max) {
|
||||||
//if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
|
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, this->n_seq_max);
|
||||||
if (seq_id < 0) {
|
|
||||||
//LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
|
|
||||||
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,14 @@
|
||||||
#include <TargetConditionals.h>
|
#include <TargetConditionals.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
# define llama_mmap_ftell _ftelli64
|
||||||
|
# define llama_mmap_fseek _fseeki64
|
||||||
|
#else
|
||||||
|
# define llama_mmap_ftell ftello
|
||||||
|
# define llama_mmap_fseek fseeko
|
||||||
|
#endif
|
||||||
|
|
||||||
// TODO: consider moving to llama-impl.h if needed in more places
|
// TODO: consider moving to llama-impl.h if needed in more places
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
static std::string llama_format_win_err(DWORD err) {
|
static std::string llama_format_win_err(DWORD err) {
|
||||||
|
|
@ -86,6 +94,14 @@ struct llama_file::impl {
|
||||||
seek(0, SEEK_SET);
|
seek(0, SEEK_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl(FILE * file) : owns_fp(false) {
|
||||||
|
fp = file;
|
||||||
|
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
|
||||||
|
seek(0, SEEK_END);
|
||||||
|
size = tell();
|
||||||
|
seek(0, SEEK_SET);
|
||||||
|
}
|
||||||
|
|
||||||
size_t tell() const {
|
size_t tell() const {
|
||||||
LARGE_INTEGER li;
|
LARGE_INTEGER li;
|
||||||
li.QuadPart = 0;
|
li.QuadPart = 0;
|
||||||
|
|
@ -159,7 +175,7 @@ struct llama_file::impl {
|
||||||
}
|
}
|
||||||
|
|
||||||
~impl() {
|
~impl() {
|
||||||
if (fp) {
|
if (fp && owns_fp) {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -209,9 +225,16 @@ struct llama_file::impl {
|
||||||
seek(0, SEEK_SET);
|
seek(0, SEEK_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl(FILE * file) : fname("(file*)"), owns_fp(false) {
|
||||||
|
fp = file;
|
||||||
|
seek(0, SEEK_END);
|
||||||
|
size = tell();
|
||||||
|
seek(0, SEEK_SET);
|
||||||
|
}
|
||||||
|
|
||||||
size_t tell() const {
|
size_t tell() const {
|
||||||
if (fd == -1) {
|
if (fd == -1) {
|
||||||
long ret = std::ftell(fp);
|
off_t ret = llama_mmap_ftell(fp);
|
||||||
if (ret == -1) {
|
if (ret == -1) {
|
||||||
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
||||||
}
|
}
|
||||||
|
|
@ -229,7 +252,7 @@ struct llama_file::impl {
|
||||||
void seek(size_t offset, int whence) const {
|
void seek(size_t offset, int whence) const {
|
||||||
off_t ret = 0;
|
off_t ret = 0;
|
||||||
if (fd == -1) {
|
if (fd == -1) {
|
||||||
ret = std::fseek(fp, (long) offset, whence);
|
ret = llama_mmap_fseek(fp, offset, whence);
|
||||||
} else {
|
} else {
|
||||||
ret = lseek(fd, offset, whence);
|
ret = lseek(fd, offset, whence);
|
||||||
}
|
}
|
||||||
|
|
@ -353,7 +376,7 @@ struct llama_file::impl {
|
||||||
~impl() {
|
~impl() {
|
||||||
if (fd != -1) {
|
if (fd != -1) {
|
||||||
close(fd);
|
close(fd);
|
||||||
} else {
|
} else if (owns_fp) {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -369,10 +392,14 @@ struct llama_file::impl {
|
||||||
|
|
||||||
FILE * fp{};
|
FILE * fp{};
|
||||||
size_t size{};
|
size_t size{};
|
||||||
|
bool owns_fp = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
||||||
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
||||||
|
|
||||||
|
llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}
|
||||||
|
|
||||||
llama_file::~llama_file() = default;
|
llama_file::~llama_file() = default;
|
||||||
|
|
||||||
size_t llama_file::tell() const { return pimpl->tell(); }
|
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||||
|
|
||||||
struct llama_file {
|
struct llama_file {
|
||||||
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
||||||
|
llama_file(FILE * file);
|
||||||
~llama_file();
|
~llama_file();
|
||||||
|
|
||||||
size_t tell() const;
|
size_t tell() const;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||||
|
|
@ -374,8 +375,9 @@ namespace GGUFMeta {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (arr_info.gt == GGUF_TYPE_BOOL) {
|
if (arr_info.gt == GGUF_TYPE_BOOL) {
|
||||||
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
|
const int8_t * values = (const int8_t *) arr_info.data;
|
||||||
return static_cast<T>(x);
|
std::transform(values, values + arr_info.length, result.begin(), [](int8_t x) {
|
||||||
|
return static_cast<T>(x != 0);
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
|
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
|
||||||
|
|
@ -511,6 +513,7 @@ llama_model_loader::llama_model_loader(
|
||||||
void * set_tensor_data_ud,
|
void * set_tensor_data_ud,
|
||||||
const std::string & fname,
|
const std::string & fname,
|
||||||
std::vector<std::string> & splits,
|
std::vector<std::string> & splits,
|
||||||
|
FILE * file,
|
||||||
bool use_mmap,
|
bool use_mmap,
|
||||||
bool use_direct_io,
|
bool use_direct_io,
|
||||||
bool check_tensors,
|
bool check_tensors,
|
||||||
|
|
@ -658,6 +661,36 @@ llama_model_loader::llama_model_loader(
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||||
}
|
}
|
||||||
|
} else if (file != nullptr) {
|
||||||
|
struct ggml_context * ctx = NULL;
|
||||||
|
struct gguf_init_params params = {
|
||||||
|
/*.no_alloc = */ true,
|
||||||
|
/*.ctx = */ &ctx,
|
||||||
|
};
|
||||||
|
|
||||||
|
metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
|
||||||
|
metadata = metadata_ptr.get();
|
||||||
|
if (metadata == nullptr) {
|
||||||
|
throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
|
||||||
|
}
|
||||||
|
|
||||||
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||||
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||||
|
|
||||||
|
files.emplace_back(new llama_file(file));
|
||||||
|
contexts.emplace_back(ctx);
|
||||||
|
|
||||||
|
// Save tensors data offset info of the main file.
|
||||||
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||||
|
std::string tensor_name = std::string(cur->name);
|
||||||
|
// make sure there are no duplicated tensor names
|
||||||
|
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||||
|
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||||
|
}
|
||||||
|
n_elements += ggml_nelements(cur);
|
||||||
|
n_bytes += ggml_nbytes(cur);
|
||||||
|
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||||
|
|
@ -669,7 +702,7 @@ llama_model_loader::llama_model_loader(
|
||||||
fver = (enum llama_fver) gguf_get_version(metadata);
|
fver = (enum llama_fver) gguf_get_version(metadata);
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||||
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
__func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));
|
||||||
|
|
||||||
// determine file type based on the number of tensors for each quantization and print meta data
|
// determine file type based on the number of tensors for each quantization and print meta data
|
||||||
// TODO: make optional
|
// TODO: make optional
|
||||||
|
|
@ -726,6 +759,7 @@ llama_model_loader::llama_model_loader(
|
||||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||||
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
||||||
|
case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||||
|
|
@ -1127,6 +1161,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
|
||||||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||||
// when overriding to a CPU buffer, consider the extra buffer types
|
// when overriding to a CPU buffer, consider the extra buffer types
|
||||||
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
||||||
|
if (use_mmap) {
|
||||||
|
static std::once_flag once;
|
||||||
|
std::call_once(once, [] {
|
||||||
|
LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
|
||||||
|
});
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
buft = overrides->buft;
|
buft = overrides->buft;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -125,6 +125,7 @@ struct llama_model_loader {
|
||||||
void * set_tensor_data_ud,
|
void * set_tensor_data_ud,
|
||||||
const std::string & fname,
|
const std::string & fname,
|
||||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||||
|
FILE * file,
|
||||||
bool use_mmap,
|
bool use_mmap,
|
||||||
bool use_direct_io,
|
bool use_direct_io,
|
||||||
bool check_tensors,
|
bool check_tensors,
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,9 @@
|
||||||
#include "llama-model-saver.h"
|
#include "llama-model-saver.h"
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
#include "gguf.h"
|
#include "gguf.h"
|
||||||
|
|
||||||
|
#include "llama-arch.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "llama-hparams.h"
|
#include "llama-hparams.h"
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
|
|
@ -10,8 +12,33 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
bool llama_model_saver_supports_arch(llm_arch arch) {
|
||||||
|
switch (arch) {
|
||||||
|
case LLM_ARCH_QWEN3NEXT:
|
||||||
|
case LLM_ARCH_QWEN35:
|
||||||
|
case LLM_ARCH_QWEN35MOE:
|
||||||
|
case LLM_ARCH_PLAMO3:
|
||||||
|
case LLM_ARCH_GEMMA3:
|
||||||
|
case LLM_ARCH_GEMMA3N:
|
||||||
|
case LLM_ARCH_COHERE2:
|
||||||
|
case LLM_ARCH_OLMO2:
|
||||||
|
case LLM_ARCH_BITNET:
|
||||||
|
case LLM_ARCH_T5:
|
||||||
|
case LLM_ARCH_EXAONE_MOE:
|
||||||
|
case LLM_ARCH_AFMOE:
|
||||||
|
case LLM_ARCH_APERTUS:
|
||||||
|
case LLM_ARCH_MIMO2:
|
||||||
|
case LLM_ARCH_STEP35:
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
llama_model_saver::llama_model_saver(const struct llama_model * model) :
|
llama_model_saver::llama_model_saver(const struct llama_model * model) :
|
||||||
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
|
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {
|
||||||
|
GGML_ASSERT(llama_model_saver_supports_arch(model->arch));
|
||||||
|
}
|
||||||
|
|
||||||
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
|
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
|
||||||
gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
|
gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
|
||||||
|
|
@ -105,7 +132,10 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
|
if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
|
||||||
GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
|
const std::string tensor_name = tensor->name;
|
||||||
|
GGML_ASSERT(
|
||||||
|
tensor_name == "rope_freqs.weight" || tensor_name == "rope_factors_long.weight" ||
|
||||||
|
tensor_name == "rope_factors_short.weight"); // FIXME
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
gguf_add_tensor(gguf_ctx, tensor);
|
gguf_add_tensor(gguf_ctx, tensor);
|
||||||
|
|
@ -127,6 +157,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
tokens[id] = token_data.text;
|
tokens[id] = token_data.text;
|
||||||
scores[id] = token_data.score;
|
scores[id] = token_data.score;
|
||||||
|
|
||||||
|
// FIXME should this be treated as flags?
|
||||||
switch(token_data.attr) {
|
switch(token_data.attr) {
|
||||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||||
|
|
@ -134,6 +165,9 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||||
|
// case LLAMA_TOKEN_ATTR_NORMALIZED: ???
|
||||||
|
// case LLAMA_TOKEN_ATTR_LSTRIP: ???
|
||||||
|
// case LLAMA_TOKEN_ATTR_RSTRIP: ???
|
||||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||||
}
|
}
|
||||||
|
|
@ -144,6 +178,19 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
|
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
|
||||||
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
||||||
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_FILE_TYPE, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_SEQUENCE, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_TOP_K, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_TOP_P, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_MIN_P, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_TEMP, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, ???);
|
||||||
|
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, ???);
|
||||||
add_kv(LLM_KV_GENERAL_NAME, model->name);
|
add_kv(LLM_KV_GENERAL_NAME, model->name);
|
||||||
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
||||||
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
||||||
|
|
@ -163,17 +210,31 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||||
add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||||
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
||||||
|
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
|
||||||
|
add_kv(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp);
|
||||||
|
add_kv(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp);
|
||||||
add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||||||
// add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
|
// add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
|
||||||
add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
||||||
add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||||
add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||||
|
add_kv(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups);
|
||||||
|
add_kv(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used);
|
||||||
add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||||
|
add_kv(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm);
|
||||||
|
add_kv(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||||
|
add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
|
||||||
|
add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||||
|
add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers);
|
||||||
|
add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers);
|
||||||
|
add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers);
|
||||||
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
||||||
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||||
add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
|
add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
|
||||||
|
add_kv(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer);
|
||||||
add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
|
add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
|
||||||
|
add_kv(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping);
|
||||||
add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
|
add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
|
||||||
add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
||||||
add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
|
add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
|
||||||
|
|
@ -181,6 +242,9 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
||||||
add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||||||
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||||
|
add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
|
||||||
|
add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
||||||
|
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);
|
||||||
|
|
||||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
||||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||||
|
|
@ -188,22 +252,39 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
|
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
|
||||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
|
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
|
||||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
|
||||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
|
||||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
||||||
|
add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
||||||
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||||
add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||||
|
add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
|
||||||
|
add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
|
||||||
|
add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
|
||||||
|
add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate);
|
||||||
add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||||||
add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||||
|
// add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, ???);
|
||||||
add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||||
|
add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale);
|
||||||
|
add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length);
|
||||||
|
add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale);
|
||||||
|
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
|
||||||
|
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
|
||||||
|
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||||
|
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||||
|
add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
|
||||||
|
add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
|
||||||
|
add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
|
||||||
|
|
||||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||||
|
|
||||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
|
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
|
||||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
|
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
|
||||||
|
add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections);
|
||||||
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||||
|
add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||||
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||||
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||||
add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
|
add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
|
||||||
|
|
@ -211,6 +292,10 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
|
add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
|
||||||
add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
|
add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
|
||||||
add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow);
|
||||||
|
|
||||||
// TODO: implement split file support
|
// TODO: implement split file support
|
||||||
// add_kv(LLM_KV_SPLIT_NO, ???);
|
// add_kv(LLM_KV_SPLIT_NO, ???);
|
||||||
|
|
@ -221,8 +306,11 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||||
add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||||||
add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||||
|
add_kv(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
||||||
add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
|
add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
|
||||||
|
|
||||||
add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||||||
|
|
||||||
add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
|
add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
|
||||||
|
|
@ -260,15 +348,39 @@ void llama_model_saver::add_kv_from_model() {
|
||||||
// TODO: implement LoRA support
|
// TODO: implement LoRA support
|
||||||
// add_kv(LLM_KV_ADAPTER_TYPE, ???);
|
// add_kv(LLM_KV_ADAPTER_TYPE, ???);
|
||||||
// add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
|
// add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
|
||||||
|
// add_kv(LLM_KV_ADAPTER_LORA_TASK_NAME, ???);
|
||||||
|
// add_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, ???);
|
||||||
|
// add_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, ???);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
|
||||||
|
add_kv(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
|
||||||
|
add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_CLASSIFIER_OUTPUT_LABELS, model->classifier_labels);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n);
|
||||||
|
add_kv(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p);
|
||||||
|
add_kv(LLM_KV_XIELU_BETA, hparams.xielu_beta);
|
||||||
|
add_kv(LLM_KV_XIELU_EPS, hparams.xielu_eps);
|
||||||
|
|
||||||
// deprecated
|
// deprecated
|
||||||
// add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
|
// add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
|
||||||
// add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
|
// add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
|
||||||
// add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
|
// add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in);
|
||||||
|
add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out);
|
||||||
|
add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in);
|
||||||
|
add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_model_saver::add_tensors_from_model() {
|
void llama_model_saver::add_tensors_from_model() {
|
||||||
if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
if (model->output != nullptr &&
|
||||||
|
std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
||||||
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
|
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
|
||||||
}
|
}
|
||||||
add_tensor(model->type_embd);
|
add_tensor(model->type_embd);
|
||||||
|
|
@ -297,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) {
|
||||||
gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
|
gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_model_saver::save(FILE * file) {
|
||||||
|
gguf_write_to_file_ptr(gguf_ctx, file, false);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,9 @@
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
// FIXME temporary function for better error messages
|
||||||
|
bool llama_model_saver_supports_arch(llm_arch arch);
|
||||||
|
|
||||||
struct llama_model_saver {
|
struct llama_model_saver {
|
||||||
struct gguf_context * gguf_ctx = nullptr;
|
struct gguf_context * gguf_ctx = nullptr;
|
||||||
const bool gguf_ctx_owned;
|
const bool gguf_ctx_owned;
|
||||||
|
|
@ -37,4 +40,5 @@ struct llama_model_saver {
|
||||||
void add_tensors_from_model();
|
void add_tensors_from_model();
|
||||||
|
|
||||||
void save(const std::string & path_model);
|
void save(const std::string & path_model);
|
||||||
|
void save(FILE * file);
|
||||||
};
|
};
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -84,6 +84,7 @@ enum llm_type {
|
||||||
LLM_TYPE_26B,
|
LLM_TYPE_26B,
|
||||||
LLM_TYPE_27B,
|
LLM_TYPE_27B,
|
||||||
LLM_TYPE_30B,
|
LLM_TYPE_30B,
|
||||||
|
LLM_TYPE_31B,
|
||||||
LLM_TYPE_32B,
|
LLM_TYPE_32B,
|
||||||
LLM_TYPE_34B,
|
LLM_TYPE_34B,
|
||||||
LLM_TYPE_35B,
|
LLM_TYPE_35B,
|
||||||
|
|
@ -118,6 +119,7 @@ enum llm_type {
|
||||||
LLM_TYPE_16B_A1B,
|
LLM_TYPE_16B_A1B,
|
||||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||||
LLM_TYPE_24B_A2B, // lfm2moe
|
LLM_TYPE_24B_A2B, // lfm2moe
|
||||||
|
LLM_TYPE_26B_A4B, // Gemma4
|
||||||
LLM_TYPE_30B_A3B,
|
LLM_TYPE_30B_A3B,
|
||||||
LLM_TYPE_31B_A3_5B,
|
LLM_TYPE_31B_A3_5B,
|
||||||
LLM_TYPE_35B_A3B, // Qwen3.5
|
LLM_TYPE_35B_A3B, // Qwen3.5
|
||||||
|
|
@ -244,6 +246,8 @@ struct llama_layer {
|
||||||
struct ggml_tensor * wkv_b = nullptr;
|
struct ggml_tensor * wkv_b = nullptr;
|
||||||
struct ggml_tensor * wk_b = nullptr;
|
struct ggml_tensor * wk_b = nullptr;
|
||||||
struct ggml_tensor * wv_b = nullptr;
|
struct ggml_tensor * wv_b = nullptr;
|
||||||
|
struct ggml_tensor * wqkv_b = nullptr;
|
||||||
|
struct ggml_tensor * wo_b = nullptr;
|
||||||
struct ggml_tensor * wq_cross = nullptr;
|
struct ggml_tensor * wq_cross = nullptr;
|
||||||
struct ggml_tensor * wk_cross = nullptr;
|
struct ggml_tensor * wk_cross = nullptr;
|
||||||
struct ggml_tensor * wv_cross = nullptr;
|
struct ggml_tensor * wv_cross = nullptr;
|
||||||
|
|
@ -254,13 +258,6 @@ struct llama_layer {
|
||||||
struct ggml_tensor * wo_enc = nullptr;
|
struct ggml_tensor * wo_enc = nullptr;
|
||||||
struct ggml_tensor * wqkv_gate = nullptr;
|
struct ggml_tensor * wqkv_gate = nullptr;
|
||||||
|
|
||||||
// attention bias
|
|
||||||
struct ggml_tensor * bq = nullptr;
|
|
||||||
struct ggml_tensor * bk = nullptr;
|
|
||||||
struct ggml_tensor * bv = nullptr;
|
|
||||||
struct ggml_tensor * bo = nullptr;
|
|
||||||
struct ggml_tensor * bqkv = nullptr;
|
|
||||||
|
|
||||||
// relative position bias
|
// relative position bias
|
||||||
struct ggml_tensor * attn_rel_b = nullptr;
|
struct ggml_tensor * attn_rel_b = nullptr;
|
||||||
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
||||||
|
|
@ -270,6 +267,9 @@ struct llama_layer {
|
||||||
struct ggml_tensor * ffn_norm = nullptr;
|
struct ggml_tensor * ffn_norm = nullptr;
|
||||||
struct ggml_tensor * ffn_norm_b = nullptr;
|
struct ggml_tensor * ffn_norm_b = nullptr;
|
||||||
struct ggml_tensor * ffn_post_norm = nullptr;
|
struct ggml_tensor * ffn_post_norm = nullptr;
|
||||||
|
struct ggml_tensor * ffn_post_norm_1 = nullptr; // gemma4
|
||||||
|
struct ggml_tensor * ffn_post_norm_2 = nullptr; // gemma4
|
||||||
|
struct ggml_tensor * ffn_pre_norm_2 = nullptr; // gemma4
|
||||||
struct ggml_tensor * layer_out_norm = nullptr;
|
struct ggml_tensor * layer_out_norm = nullptr;
|
||||||
struct ggml_tensor * layer_out_norm_b = nullptr;
|
struct ggml_tensor * layer_out_norm_b = nullptr;
|
||||||
struct ggml_tensor * ffn_norm_exps = nullptr;
|
struct ggml_tensor * ffn_norm_exps = nullptr;
|
||||||
|
|
@ -285,6 +285,7 @@ struct llama_layer {
|
||||||
|
|
||||||
// ff MoE
|
// ff MoE
|
||||||
struct ggml_tensor * ffn_gate_inp = nullptr;
|
struct ggml_tensor * ffn_gate_inp = nullptr;
|
||||||
|
struct ggml_tensor * ffn_gate_inp_s = nullptr; // gemma4
|
||||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||||
|
|
@ -409,10 +410,32 @@ struct llama_layer {
|
||||||
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
|
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
|
||||||
struct ggml_tensor * ffn_up_shexp_s = nullptr;
|
struct ggml_tensor * ffn_up_shexp_s = nullptr;
|
||||||
struct ggml_tensor * ffn_down_shexp_s = nullptr;
|
struct ggml_tensor * ffn_down_shexp_s = nullptr;
|
||||||
struct ggml_tensor * ssm_out_s = nullptr;
|
struct ggml_tensor * ssm_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ssm_out_s = nullptr;
|
||||||
struct ggml_tensor * ssm_alpha_s = nullptr;
|
struct ggml_tensor * ssm_alpha_s = nullptr;
|
||||||
struct ggml_tensor * ssm_beta_s = nullptr;
|
struct ggml_tensor * ssm_beta_s = nullptr;
|
||||||
|
|
||||||
|
// input scales
|
||||||
|
struct ggml_tensor * wq_in_s = nullptr;
|
||||||
|
struct ggml_tensor * wk_in_s = nullptr;
|
||||||
|
struct ggml_tensor * wv_in_s = nullptr;
|
||||||
|
struct ggml_tensor * wo_in_s = nullptr;
|
||||||
|
struct ggml_tensor * wqkv_in_s = nullptr;
|
||||||
|
struct ggml_tensor * wqkv_gate_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_gate_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_up_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_down_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_gate_exps_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_down_exps_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_up_exps_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_gate_shexp_in_s= nullptr;
|
||||||
|
struct ggml_tensor * ffn_up_shexp_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ffn_down_shexp_in_s= nullptr;
|
||||||
|
struct ggml_tensor * ssm_in_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ssm_out_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ssm_alpha_in_s = nullptr;
|
||||||
|
struct ggml_tensor * ssm_beta_in_s = nullptr;
|
||||||
|
|
||||||
// altup & laurel
|
// altup & laurel
|
||||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||||
struct ggml_tensor * per_layer_proj = nullptr;
|
struct ggml_tensor * per_layer_proj = nullptr;
|
||||||
|
|
@ -461,6 +484,9 @@ struct llama_layer {
|
||||||
struct ggml_tensor * indexer_attn_k = nullptr;
|
struct ggml_tensor * indexer_attn_k = nullptr;
|
||||||
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
||||||
|
|
||||||
|
// gemma4 layer output scale
|
||||||
|
struct ggml_tensor * out_scale = nullptr;
|
||||||
|
|
||||||
struct llama_layer_posnet posnet;
|
struct llama_layer_posnet posnet;
|
||||||
|
|
||||||
struct llama_layer_convnext convnext;
|
struct llama_layer_convnext convnext;
|
||||||
|
|
@ -470,6 +496,19 @@ struct llama_layer {
|
||||||
struct llama_layer_nextn nextn;
|
struct llama_layer_nextn nextn;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct llama_device {
|
||||||
|
bool is_meta;
|
||||||
|
|
||||||
|
ggml_backend_dev_t dev;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_meta_device_get_split_state_userdata {
|
||||||
|
size_t n_devices;
|
||||||
|
const struct llama_model * model;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata);
|
||||||
|
|
||||||
struct llama_model {
|
struct llama_model {
|
||||||
llm_type type = LLM_TYPE_UNKNOWN;
|
llm_type type = LLM_TYPE_UNKNOWN;
|
||||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||||
|
|
@ -505,9 +544,9 @@ struct llama_model {
|
||||||
struct ggml_tensor * conv1d_b = nullptr;
|
struct ggml_tensor * conv1d_b = nullptr;
|
||||||
|
|
||||||
// gemma3n altup
|
// gemma3n altup
|
||||||
struct ggml_tensor * tok_embd_per_layer = nullptr;
|
|
||||||
struct ggml_tensor * altup_proj = nullptr;
|
struct ggml_tensor * altup_proj = nullptr;
|
||||||
struct ggml_tensor * altup_unembd_proj = nullptr;
|
struct ggml_tensor * altup_unembd_proj = nullptr;
|
||||||
|
struct ggml_tensor * per_layer_tok_embd = nullptr;
|
||||||
struct ggml_tensor * per_layer_model_proj = nullptr;
|
struct ggml_tensor * per_layer_model_proj = nullptr;
|
||||||
struct ggml_tensor * per_layer_proj_norm = nullptr;
|
struct ggml_tensor * per_layer_proj_norm = nullptr;
|
||||||
|
|
||||||
|
|
@ -524,7 +563,7 @@ struct llama_model {
|
||||||
std::unordered_map<std::string, std::string> gguf_kv;
|
std::unordered_map<std::string, std::string> gguf_kv;
|
||||||
|
|
||||||
// list of devices used in this model
|
// list of devices used in this model
|
||||||
std::vector<ggml_backend_dev_t> devices;
|
std::vector<llama_device> devices;
|
||||||
|
|
||||||
// for quantize-stats only
|
// for quantize-stats only
|
||||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||||
|
|
@ -532,6 +571,9 @@ struct llama_model {
|
||||||
// for keeping track of associated LoRA adapters
|
// for keeping track of associated LoRA adapters
|
||||||
std::unordered_set<llama_adapter_lora *> loras;
|
std::unordered_set<llama_adapter_lora *> loras;
|
||||||
|
|
||||||
|
// statically allocated context for assigning
|
||||||
|
struct llama_meta_device_get_split_state_userdata get_split_state_ud;
|
||||||
|
|
||||||
int64_t t_load_us = 0;
|
int64_t t_load_us = 0;
|
||||||
int64_t t_start_us = 0;
|
int64_t t_start_us = 0;
|
||||||
|
|
||||||
|
|
@ -552,6 +594,7 @@ struct llama_model {
|
||||||
size_t size() const; // file size
|
size_t size() const; // file size
|
||||||
size_t n_tensors() const;
|
size_t n_tensors() const;
|
||||||
size_t n_devices() const;
|
size_t n_devices() const;
|
||||||
|
const float * tensor_split() const;
|
||||||
|
|
||||||
uint32_t n_gpu_layers() const;
|
uint32_t n_gpu_layers() const;
|
||||||
llama_split_mode split_mode() const;
|
llama_split_mode split_mode() const;
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,11 @@
|
||||||
#include "llama.h"
|
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
#include "llama-model-loader.h"
|
#include "llama-model-loader.h"
|
||||||
|
#include "llama-ext.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <string>
|
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
|
@ -84,7 +84,6 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
|
||||||
|
|
||||||
for (const auto & p : mapped) {
|
for (const auto & p : mapped) {
|
||||||
if (p.second == blk) {
|
if (p.second == blk) {
|
||||||
LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
|
|
||||||
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
|
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -188,10 +187,9 @@ struct quantize_state_impl {
|
||||||
model(model), params(params)
|
model(model), params(params)
|
||||||
{
|
{
|
||||||
// compile regex patterns once - they are expensive
|
// compile regex patterns once - they are expensive
|
||||||
if (params->tensor_types) {
|
if (params->tt_overrides) {
|
||||||
const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
|
for (const auto * p = params->tt_overrides; p->pattern != nullptr; p++) {
|
||||||
for (const auto & [tname, qtype] : tensor_types) {
|
tensor_type_patterns.emplace_back(std::regex(p->pattern), p->type);
|
||||||
tensor_type_patterns.emplace_back(std::regex(tname), qtype);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -199,6 +197,7 @@ struct quantize_state_impl {
|
||||||
|
|
||||||
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
||||||
struct tensor_metadata {
|
struct tensor_metadata {
|
||||||
|
std::string name;
|
||||||
ggml_type target_type;
|
ggml_type target_type;
|
||||||
tensor_category category;
|
tensor_category category;
|
||||||
std::string remapped_imatrix_name;
|
std::string remapped_imatrix_name;
|
||||||
|
|
@ -344,7 +343,13 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
|
||||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||||
|
|
||||||
// do not quantize specific multimodal tensors
|
// do not quantize specific multimodal tensors
|
||||||
quantize &= name.find(".position_embd.") == std::string::npos;
|
quantize &= name.find(".position_embd") == std::string::npos;
|
||||||
|
quantize &= name.find("sam.pos_embd") == std::string::npos;
|
||||||
|
quantize &= name.find("sam.neck.") == std::string::npos;
|
||||||
|
quantize &= name.find("sam.net_") == std::string::npos;
|
||||||
|
quantize &= name.find(".rel_pos") == std::string::npos;
|
||||||
|
quantize &= name.find(".patch_embd") == std::string::npos;
|
||||||
|
quantize &= name.find(".patch_merger") == std::string::npos;
|
||||||
|
|
||||||
return quantize;
|
return quantize;
|
||||||
}
|
}
|
||||||
|
|
@ -678,9 +683,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_mod
|
||||||
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
||||||
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
|
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
|
||||||
new_type = qtype;
|
new_type = qtype;
|
||||||
manual = true;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
manual = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -784,7 +789,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
|
||||||
// given a file type, get the default tensor type
|
// given a file type, get the default tensor type
|
||||||
//
|
//
|
||||||
|
|
||||||
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||||
switch (ftype) {
|
switch (ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
||||||
|
|
@ -794,6 +799,7 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
|
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
|
||||||
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
|
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
|
||||||
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
|
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0;
|
||||||
|
|
||||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
|
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
|
||||||
|
|
||||||
|
|
@ -823,16 +829,32 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
||||||
|
|
||||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
default: return GGML_TYPE_COUNT;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
|
||||||
|
for (auto & tm : metadata) {
|
||||||
|
tensor_category cat = tensor_get_category(tm.name);
|
||||||
|
tm.category = cat;
|
||||||
|
|
||||||
|
if (category_is_attn_v(cat)) {
|
||||||
|
++qs.n_attention_wv;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cat == tensor_category::OUTPUT) {
|
||||||
|
qs.has_tied_embeddings = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// main quantization driver
|
// main quantization driver
|
||||||
//
|
//
|
||||||
|
|
||||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||||
ggml_type default_type;
|
|
||||||
llama_ftype ftype = params->ftype;
|
llama_ftype ftype = params->ftype;
|
||||||
|
|
||||||
int nthread = params->nthread;
|
int nthread = params->nthread;
|
||||||
|
|
@ -841,7 +863,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
nthread = std::thread::hardware_concurrency();
|
nthread = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
|
|
||||||
default_type = llama_ftype_get_default_type(ftype);
|
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||||
|
if (default_type == GGML_TYPE_COUNT) {
|
||||||
|
throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||||
|
}
|
||||||
|
|
||||||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||||
|
|
@ -851,15 +876,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
constexpr bool use_mmap = false;
|
constexpr bool use_mmap = false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
llama_model_kv_override * kv_overrides = nullptr;
|
const llama_model_kv_override * kv_overrides = params->kv_overrides;
|
||||||
if (params->kv_overrides) {
|
|
||||||
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
|
||||||
kv_overrides = v->data();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
||||||
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||||
ml.init_mappings(false); // no prefetching
|
ml.init_mappings(false); // no prefetching
|
||||||
|
|
||||||
llama_model model(llama_model_default_params());
|
llama_model model(llama_model_default_params());
|
||||||
|
|
@ -873,9 +893,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
if (params->only_copy) {
|
if (params->only_copy) {
|
||||||
ftype = ml.ftype;
|
ftype = ml.ftype;
|
||||||
}
|
}
|
||||||
|
std::unordered_map<std::string, std::vector<float>> i_data;
|
||||||
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
||||||
if (params->imatrix) {
|
if (params->imatrix) {
|
||||||
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
for (const llama_model_imatrix_data * p = params->imatrix; p->name != nullptr; p++) {
|
||||||
|
i_data.emplace(p->name, std::vector<float>(p->data, p->data + p->size));
|
||||||
|
}
|
||||||
|
imatrix_data = & i_data;
|
||||||
if (imatrix_data) {
|
if (imatrix_data) {
|
||||||
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
|
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
|
||||||
__func__, (int)imatrix_data->size());
|
__func__, (int)imatrix_data->size());
|
||||||
|
|
@ -896,7 +920,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
|
|
||||||
std::vector<int> prune_list = {};
|
std::vector<int> prune_list = {};
|
||||||
if (params->prune_layers) {
|
if (params->prune_layers) {
|
||||||
prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
|
for (const int32_t * p = params->prune_layers; * p != -1; p++) {
|
||||||
|
prune_list.push_back(* p);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy the KV pairs from the input file
|
// copy the KV pairs from the input file
|
||||||
|
|
@ -910,20 +936,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
||||||
|
|
||||||
if (params->kv_overrides) {
|
if (params->kv_overrides) {
|
||||||
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) {
|
||||||
for (const auto & o : overrides) {
|
if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
||||||
if (o.key[0] == 0) break;
|
gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64);
|
||||||
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
||||||
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
|
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
|
||||||
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
||||||
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
|
gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64));
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
||||||
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
|
gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool);
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
||||||
gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
|
gguf_set_val_str(ctx_out.get(), o->key, o->val_str);
|
||||||
} else {
|
} else {
|
||||||
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -961,6 +985,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute tensor metadata once and cache it
|
||||||
|
std::vector<tensor_metadata> metadata(tensors.size());
|
||||||
|
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||||
|
metadata[i].name = ggml_get_name(tensors[i]->tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize quantization state counters and metadata categories
|
||||||
|
init_quantize_state_counters(qs, metadata);
|
||||||
|
|
||||||
int idx = 0;
|
int idx = 0;
|
||||||
uint16_t n_split = 1;
|
uint16_t n_split = 1;
|
||||||
|
|
||||||
|
|
@ -973,25 +1006,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
||||||
ctx_outs[0] = std::move(ctx_out);
|
ctx_outs[0] = std::move(ctx_out);
|
||||||
|
|
||||||
// compute tensor metadata once and cache it
|
|
||||||
std::vector<tensor_metadata> metadata(tensors.size());
|
|
||||||
|
|
||||||
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
||||||
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
|
||||||
if (category_is_attn_v(cat)) {
|
|
||||||
++qs.n_attention_wv;
|
|
||||||
}
|
|
||||||
if (cat == tensor_category::OUTPUT) {
|
|
||||||
qs.has_tied_embeddings = false;
|
|
||||||
}
|
|
||||||
metadata[i].category = cat; // save and re-use the category while we're at it
|
|
||||||
}
|
|
||||||
// these also need to be set to n_layer by default
|
|
||||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
|
||||||
}
|
|
||||||
|
|
||||||
// flag for --dry-run
|
// flag for --dry-run
|
||||||
bool will_require_imatrix = false;
|
bool will_require_imatrix = false;
|
||||||
|
|
||||||
|
|
@ -1002,7 +1016,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||||
const auto * it = tensors[i];
|
const auto * it = tensors[i];
|
||||||
const struct ggml_tensor * tensor = it->tensor;
|
const struct ggml_tensor * tensor = it->tensor;
|
||||||
const std::string name = ggml_get_name(tensor);
|
|
||||||
|
|
||||||
uint16_t i_split = params->keep_split ? it->idx : 0;
|
uint16_t i_split = params->keep_split ? it->idx : 0;
|
||||||
if (!ctx_outs[i_split]) {
|
if (!ctx_outs[i_split]) {
|
||||||
|
|
@ -1031,7 +1044,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
" - offending tensor: %s\n"
|
" - offending tensor: %s\n"
|
||||||
" - target type: %s\n"
|
" - target type: %s\n"
|
||||||
"============================================================================\n\n",
|
"============================================================================\n\n",
|
||||||
name.c_str(), ggml_type_name(metadata[i].target_type));
|
metadata[i].name.c_str(), ggml_type_name(metadata[i].target_type));
|
||||||
throw std::runtime_error("this quantization requires an imatrix!");
|
throw std::runtime_error("this quantization requires an imatrix!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1104,7 +1117,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
new_ofstream(weight.idx);
|
new_ofstream(weight.idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string name = ggml_get_name(tensor);
|
|
||||||
const size_t tensor_size = ggml_nbytes(tensor);
|
const size_t tensor_size = ggml_nbytes(tensor);
|
||||||
|
|
||||||
if (!params->dry_run) {
|
if (!params->dry_run) {
|
||||||
|
|
@ -1235,9 +1247,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
total_size_new += new_size;
|
total_size_new += new_size;
|
||||||
|
|
||||||
// update the gguf meta data as we go
|
// update the gguf meta data as we go
|
||||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
gguf_set_tensor_type(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_type);
|
||||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), metadata[i].name.c_str())) == new_size);
|
||||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
gguf_set_tensor_data(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_data);
|
||||||
|
|
||||||
// write tensor data + padding
|
// write tensor data + padding
|
||||||
fout.write((const char *) new_data, new_size);
|
fout.write((const char *) new_data, new_size);
|
||||||
|
|
@ -1271,7 +1283,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
llama_model_quantize_params llama_model_quantize_default_params() {
|
llama_model_quantize_params llama_model_quantize_default_params() {
|
||||||
llama_model_quantize_params result = {
|
llama_model_quantize_params result = {
|
||||||
/*.nthread =*/ 0,
|
/*.nthread =*/ 0,
|
||||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||||
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
||||||
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
||||||
/*.allow_requantize =*/ false,
|
/*.allow_requantize =*/ false,
|
||||||
|
|
@ -1302,3 +1314,89 @@ uint32_t llama_model_quantize(
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Helper functions for external tools exposed in llama-ext.h
|
||||||
|
//
|
||||||
|
|
||||||
|
quantize_state_impl * llama_quant_init(
|
||||||
|
const llama_model * model,
|
||||||
|
const llama_model_quantize_params * params) {
|
||||||
|
return new quantize_state_impl(*model, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_quant_free(quantize_state_impl * qs) {
|
||||||
|
delete qs;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
|
||||||
|
struct llama_model_params mparams = llama_model_default_params();
|
||||||
|
auto * model = new llama_model(mparams);
|
||||||
|
|
||||||
|
model->arch = llm_arch_from_string(desc->architecture);
|
||||||
|
|
||||||
|
// infer llm_type: only LLM_TYPE_70B matters for quantization logic
|
||||||
|
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
|
||||||
|
model->type = LLM_TYPE_70B;
|
||||||
|
}
|
||||||
|
|
||||||
|
model->hparams.n_embd = desc->n_embd;
|
||||||
|
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
|
||||||
|
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
|
||||||
|
model->hparams.n_layer = desc->n_layer;
|
||||||
|
model->hparams.n_expert = desc->n_expert;
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < desc->n_layer; i++) {
|
||||||
|
model->hparams.n_head_arr[i] = desc->n_head;
|
||||||
|
model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
|
||||||
|
model->hparams.n_ff_arr[i] = desc->n_ff;
|
||||||
|
}
|
||||||
|
|
||||||
|
return model;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llama_quant_tensor_allows_quantization(
|
||||||
|
const quantize_state_impl * qs,
|
||||||
|
const ggml_tensor * tensor) {
|
||||||
|
return tensor_allows_quantization(qs->params, qs->model.arch, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_quant_compute_types(
|
||||||
|
quantize_state_impl * qs,
|
||||||
|
llama_ftype ftype,
|
||||||
|
ggml_tensor ** tensors,
|
||||||
|
ggml_type * result_types,
|
||||||
|
size_t n_tensors) {
|
||||||
|
// reset per-computation state
|
||||||
|
qs->n_attention_wv = 0;
|
||||||
|
qs->n_ffn_down = 0;
|
||||||
|
qs->n_ffn_gate = 0;
|
||||||
|
qs->n_ffn_up = 0;
|
||||||
|
qs->i_attention_wv = 0;
|
||||||
|
qs->i_ffn_down = 0;
|
||||||
|
qs->i_ffn_gate = 0;
|
||||||
|
qs->i_ffn_up = 0;
|
||||||
|
qs->n_fallback = 0;
|
||||||
|
qs->has_imatrix = false;
|
||||||
|
qs->has_tied_embeddings = true;
|
||||||
|
|
||||||
|
// build metadata from tensor names
|
||||||
|
std::vector<tensor_metadata> metadata(n_tensors);
|
||||||
|
for (size_t i = 0; i < n_tensors; i++) {
|
||||||
|
metadata[i].name = ggml_get_name(tensors[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize counters and categories
|
||||||
|
init_quantize_state_counters(*qs, metadata);
|
||||||
|
|
||||||
|
// use a local copy of params with the requested ftype
|
||||||
|
llama_model_quantize_params local_params = *qs->params;
|
||||||
|
local_params.ftype = ftype;
|
||||||
|
|
||||||
|
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||||
|
|
||||||
|
// compute types
|
||||||
|
for (size_t i = 0; i < n_tensors; i++) {
|
||||||
|
result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
|
||||||
|
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
|
||||||
|
// normalizer, then BPE merges run on the whole text without
|
||||||
|
// word-level pre-splitting. We only need to split on newlines
|
||||||
|
// since BPE merge lookup asserts no newlines in tokens.
|
||||||
|
regex_exprs = {
|
||||||
|
"[^\\n]+|[\\n]+",
|
||||||
|
};
|
||||||
|
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
|
|
@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> regex_exprs;
|
std::vector<std::string> regex_exprs;
|
||||||
|
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llm_tokenizer_bpe_session {
|
struct llm_tokenizer_bpe_session {
|
||||||
|
|
@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
|
||||||
|
|
||||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||||
int final_prev_index = -1;
|
int final_prev_index = -1;
|
||||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
|
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
|
||||||
|
|
||||||
symbols_final.clear();
|
symbols_final.clear();
|
||||||
|
auto tok_pre = vocab.get_pre_type();
|
||||||
|
|
||||||
for (const auto & word : word_collection) {
|
for (const auto & word : word_collection) {
|
||||||
work_queue = llm_bigram_bpe::queue();
|
work_queue = llm_bigram_bpe::queue();
|
||||||
|
|
@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
|
||||||
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
|
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
|
||||||
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||||
offset = word.size();
|
offset = word.size();
|
||||||
|
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
|
||||||
|
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
|
||||||
|
auto tok = vocab.text_to_token(word);
|
||||||
|
if (tok != LLAMA_TOKEN_NULL) {
|
||||||
|
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||||
|
offset = word.size();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while (offset < word.size()) {
|
while (offset < word.size()) {
|
||||||
|
|
@ -640,8 +659,17 @@ struct llm_tokenizer_bpe_session {
|
||||||
|
|
||||||
if (token == LLAMA_TOKEN_NULL) {
|
if (token == LLAMA_TOKEN_NULL) {
|
||||||
for (auto j = str.begin(); j != str.end(); ++j) {
|
for (auto j = str.begin(); j != str.end(); ++j) {
|
||||||
std::string byte_str(1, *j);
|
llama_token token_multibyte = LLAMA_TOKEN_NULL;
|
||||||
auto token_multibyte = vocab.text_to_token(byte_str);
|
if (tokenizer.byte_encode) {
|
||||||
|
std::string byte_str(1, *j);
|
||||||
|
token_multibyte = vocab.text_to_token(byte_str);
|
||||||
|
} else {
|
||||||
|
// For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
|
||||||
|
static const char * hex = "0123456789ABCDEF";
|
||||||
|
const uint8_t ch = (uint8_t)*j;
|
||||||
|
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||||
|
token_multibyte = vocab.text_to_token(buf);
|
||||||
|
}
|
||||||
if (token_multibyte != LLAMA_TOKEN_NULL) {
|
if (token_multibyte != LLAMA_TOKEN_NULL) {
|
||||||
output.push_back(token_multibyte);
|
output.push_back(token_multibyte);
|
||||||
}
|
}
|
||||||
|
|
@ -1863,6 +1891,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
special_sep_id = LLAMA_TOKEN_NULL;
|
special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
special_pad_id = 3; // <|plamo:pad|>
|
special_pad_id = 3; // <|plamo:pad|>
|
||||||
special_mask_id = LLAMA_TOKEN_NULL;
|
special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
} else if (tokenizer_model == "gemma4") {
|
||||||
|
type = LLAMA_VOCAB_TYPE_BPE;
|
||||||
|
|
||||||
|
// read bpe merges and populate bpe ranks
|
||||||
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
||||||
|
if (merges_keyidx == -1) {
|
||||||
|
throw std::runtime_error("cannot find tokenizer merges in model file\n");
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
|
||||||
|
for (int i = 0; i < n_merges; i++) {
|
||||||
|
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
|
||||||
|
|
||||||
|
std::string first;
|
||||||
|
std::string second;
|
||||||
|
|
||||||
|
const size_t pos = word.find(' ', 1);
|
||||||
|
|
||||||
|
if (pos != std::string::npos) {
|
||||||
|
first = word.substr(0, pos);
|
||||||
|
second = word.substr(pos + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
bpe_ranks.emplace(std::make_pair(first, second), i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// default special tokens (to be read from GGUF)
|
||||||
|
special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_eos_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_unk_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
|
tokenizer_pre = "gemma4";
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||||
}
|
}
|
||||||
|
|
@ -1870,6 +1934,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
// for now, only BPE models have pre-tokenizers
|
// for now, only BPE models have pre-tokenizers
|
||||||
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
||||||
add_space_prefix = false;
|
add_space_prefix = false;
|
||||||
|
escape_whitespaces = false;
|
||||||
clean_spaces = true;
|
clean_spaces = true;
|
||||||
if (tokenizer_pre.empty()) {
|
if (tokenizer_pre.empty()) {
|
||||||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||||
|
|
@ -1936,6 +2001,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "jais-2") {
|
tokenizer_pre == "jais-2") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "gemma4") {
|
||||||
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
|
||||||
|
escape_whitespaces = true;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "jina-v1-en" ||
|
tokenizer_pre == "jina-v1-en" ||
|
||||||
tokenizer_pre == "jina-v2-code" ||
|
tokenizer_pre == "jina-v2-code" ||
|
||||||
|
|
@ -1952,7 +2021,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "qwen2" ||
|
tokenizer_pre == "qwen2" ||
|
||||||
tokenizer_pre == "deepseek-r1-qwen" ||
|
tokenizer_pre == "deepseek-r1-qwen" ||
|
||||||
tokenizer_pre == "kormo") {
|
tokenizer_pre == "kormo" ||
|
||||||
|
tokenizer_pre == "f2llmv2") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
} else if (
|
} else if (
|
||||||
|
|
@ -2129,19 +2199,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
||||||
|
|
||||||
const float * scores = nullptr;
|
const float * scores = nullptr;
|
||||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||||
if (score_idx != -1) {
|
if (score_idx != -1) {
|
||||||
|
const uint32_t n_scores = gguf_get_arr_n(ctx, score_idx);
|
||||||
|
if (n_scores < n_tokens) {
|
||||||
|
throw std::runtime_error("Index out of array bounds for scores (" + std::to_string(n_scores) + " < " + std::to_string(n_tokens) + ")\n");
|
||||||
|
}
|
||||||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int * toktypes = nullptr;
|
const int * toktypes = nullptr;
|
||||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||||
if (toktype_idx != -1) {
|
if (toktype_idx != -1) {
|
||||||
|
const uint32_t n_toktypes = gguf_get_arr_n(ctx, toktype_idx);
|
||||||
|
if (n_toktypes < n_tokens) {
|
||||||
|
throw std::runtime_error("Index out of array bounds for toktypes (" + std::to_string(n_toktypes) + " < " + std::to_string(n_tokens) + ")\n");
|
||||||
|
}
|
||||||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
|
||||||
id_to_token.resize(n_tokens);
|
id_to_token.resize(n_tokens);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||||
|
|
@ -2255,6 +2334,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
|
if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
|
||||||
add_sep = temp;
|
add_sep = temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// workaround for Gemma 4
|
||||||
|
// ref: https://github.com/ggml-org/llama.cpp/pull/21500
|
||||||
|
if (pre_type == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && !add_bos) {
|
||||||
|
add_bos = true;
|
||||||
|
|
||||||
|
LLAMA_LOG_WARN("%s: override '%s' to 'true' for Gemma4\n", __func__, kv(LLM_KV_TOKENIZER_ADD_BOS).c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// auto-detect special tokens by text
|
// auto-detect special tokens by text
|
||||||
|
|
@ -2480,6 +2567,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
|| t.first == "[EOS]" // Kimi-K2
|
|| t.first == "[EOS]" // Kimi-K2
|
||||||
|| t.first == "<|end_of_text|>"
|
|| t.first == "<|end_of_text|>"
|
||||||
|| t.first == "<end_of_utterance>" // smoldocling
|
|| t.first == "<end_of_utterance>" // smoldocling
|
||||||
|
|| t.first == "<eos>" // gemma4
|
||||||
|
|| t.first == "<turn|>" // gemma4
|
||||||
|
|| t.first == "<|tool_response>" // gemma4
|
||||||
|
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
||||||
) {
|
) {
|
||||||
special_eog_ids.insert(t.second);
|
special_eog_ids.insert(t.second);
|
||||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
|
@ -2564,6 +2655,33 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// workaround for gemma4 and paddleocr: do not include </s> as an eog token
|
||||||
|
{
|
||||||
|
bool has_tool_response = false;
|
||||||
|
bool has_s = false;
|
||||||
|
|
||||||
|
llama_token s_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
|
for (auto tid : special_eog_ids) {
|
||||||
|
const auto & text = id_to_token[tid].text;
|
||||||
|
if (text == "<|tool_response>") {
|
||||||
|
has_tool_response = true;
|
||||||
|
} else if (text == "</s>") {
|
||||||
|
has_s = true;
|
||||||
|
s_id = tid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_tool_response && has_s) {
|
||||||
|
special_eog_ids.erase(s_id);
|
||||||
|
|
||||||
|
auto & attr = id_to_token[s_id].attr;
|
||||||
|
attr = LLAMA_TOKEN_ATTR_NORMAL;
|
||||||
|
|
||||||
|
LLAMA_LOG_WARN("%s: special_eog_ids contains '<|tool_response>', removing '</s>' token from EOG list\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// build special tokens cache
|
// build special tokens cache
|
||||||
|
|
@ -2732,7 +2850,9 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
|
||||||
return strtol(buf.c_str(), NULL, 16);
|
return strtol(buf.c_str(), NULL, 16);
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_BPE: {
|
case LLAMA_VOCAB_TYPE_BPE: {
|
||||||
GGML_ABORT("fatal error");
|
// Gemma4 uses BPE with SPM-style byte fallback tokens (<0xXX>)
|
||||||
|
auto buf = token_data.text.substr(3, 2);
|
||||||
|
return strtol(buf.c_str(), NULL, 16);
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_WPM: {
|
case LLAMA_VOCAB_TYPE_WPM: {
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
|
|
@ -3021,6 +3141,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
||||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||||
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||||
|
|
||||||
|
if (escape_whitespaces) {
|
||||||
|
llama_escape_whitespace(text);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef PRETOKENIZERDEBUG
|
#ifdef PRETOKENIZERDEBUG
|
||||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -3200,9 +3324,19 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
||||||
return _try_copy(token_text.data(), token_text.size());
|
return _try_copy(token_text.data(), token_text.size());
|
||||||
}
|
}
|
||||||
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
||||||
|
if (escape_whitespaces) {
|
||||||
|
// SPM-style BPE: tokens contain ▁ for spaces
|
||||||
|
std::string result = token_text;
|
||||||
|
llama_unescape_whitespace(result);
|
||||||
|
return _try_copy(result.data(), result.size());
|
||||||
|
}
|
||||||
std::string result = llama_decode_text(token_text);
|
std::string result = llama_decode_text(token_text);
|
||||||
return _try_copy(result.data(), result.size());
|
return _try_copy(result.data(), result.size());
|
||||||
}
|
}
|
||||||
|
if (attr & LLAMA_TOKEN_ATTR_BYTE) {
|
||||||
|
char byte = (char) token_to_byte(token);
|
||||||
|
return _try_copy((char*) &byte, 1);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_RWKV: {
|
case LLAMA_VOCAB_TYPE_RWKV: {
|
||||||
|
|
@ -3630,9 +3764,7 @@ int llama_vocab::max_token_len() const {
|
||||||
|
|
||||||
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
||||||
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
||||||
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
|
||||||
GGML_ASSERT(token_right.find(' ') == std::string::npos);
|
GGML_ASSERT(token_right.find(' ') == std::string::npos);
|
||||||
GGML_ASSERT(token_right.find('\n') == std::string::npos);
|
|
||||||
|
|
||||||
auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
|
auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||||
if (it == pimpl->bpe_ranks.end()) {
|
if (it == pimpl->bpe_ranks.end()) {
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
|
||||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV;
|
struct LLM_KV;
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include "ggml-cpp.h"
|
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
|
|
||||||
#include "llama-chat.h"
|
#include "llama-chat.h"
|
||||||
|
|
@ -12,6 +11,7 @@
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpp.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "gguf.h"
|
#include "gguf.h"
|
||||||
|
|
||||||
|
|
@ -24,6 +24,7 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
|
@ -45,722 +46,6 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_device_memory_data {
|
|
||||||
int64_t total;
|
|
||||||
int64_t free;
|
|
||||||
llama_memory_breakdown_data mb;
|
|
||||||
};
|
|
||||||
|
|
||||||
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|
||||||
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
|
||||||
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
|
||||||
const ggml_log_level log_level) {
|
|
||||||
struct user_data_t {
|
|
||||||
struct {
|
|
||||||
ggml_log_callback callback;
|
|
||||||
void * user_data;
|
|
||||||
} original_logger;
|
|
||||||
ggml_log_level min_level; // prints below this log level go to debug log
|
|
||||||
};
|
|
||||||
user_data_t ud;
|
|
||||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
|
||||||
ud.min_level = log_level;
|
|
||||||
|
|
||||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
|
||||||
const user_data_t * ud = (const user_data_t *) user_data;
|
|
||||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
|
||||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
|
||||||
}, &ud);
|
|
||||||
|
|
||||||
llama_model_params mparams_copy = *mparams;
|
|
||||||
mparams_copy.no_alloc = true;
|
|
||||||
mparams_copy.use_mmap = false;
|
|
||||||
mparams_copy.use_mlock = false;
|
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
|
||||||
if (model == nullptr) {
|
|
||||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
||||||
throw std::runtime_error("failed to load model");
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
|
||||||
if (ctx == nullptr) {
|
|
||||||
llama_model_free(model);
|
|
||||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
||||||
throw std::runtime_error("failed to create llama_context from model");
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<llama_device_memory_data> ret(model->devices.size());
|
|
||||||
|
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
|
||||||
|
|
||||||
for (const auto & [buft, mb] : memory_breakdown) {
|
|
||||||
if (ggml_backend_buft_is_host(buft)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
|
||||||
if (!dev) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (size_t i = 0; i < ret.size(); i++) {
|
|
||||||
if (model->devices[i] == dev) {
|
|
||||||
ret[i].mb.model += mb.model;
|
|
||||||
ret[i].mb.context += mb.context;
|
|
||||||
ret[i].mb.compute += mb.compute;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (size_t i = 0; i < ret.size(); i++) {
|
|
||||||
size_t free;
|
|
||||||
size_t total;
|
|
||||||
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
|
||||||
|
|
||||||
// devices can return 0 bytes for free and total memory if they do not
|
|
||||||
// have any to report. in this case, we will use the host memory as a fallback
|
|
||||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
|
||||||
if (free == 0 && total == 0) {
|
|
||||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
||||||
if (cpu_dev == nullptr) {
|
|
||||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
||||||
}
|
|
||||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
|
||||||
}
|
|
||||||
ret[i].free = free;
|
|
||||||
ret[i].total = total;
|
|
||||||
}
|
|
||||||
|
|
||||||
devs = model->devices;
|
|
||||||
hp_ngl = model->hparams.n_layer;
|
|
||||||
hp_n_ctx_train = model->hparams.n_ctx_train;
|
|
||||||
hp_n_expert = model->hparams.n_expert;
|
|
||||||
|
|
||||||
llama_memory_breakdown_print(ctx); // goes to debug log
|
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_model_free(model);
|
|
||||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Identifies which part of a layer is meant when the tensors of that layer are distributed.
// The enum is only used in llama_params_fit_impl, but it has to live at file scope
// to fix a Windows compilation issue.
enum layer_fraction_t {
    LAYER_FRACTION_NONE = 0, // nothing
    LAYER_FRACTION_ATTN = 1, // attention
    LAYER_FRACTION_UP   = 2, // attention + up
    LAYER_FRACTION_GATE = 3, // attention + up + gate
    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
};
|
|
||||||
|
|
||||||
// Exception type thrown by the parameter fitting code when it has to abort.
// It adds no state or behavior of its own: it only inherits the constructors of
// std::runtime_error, so that these errors can be told apart by type.
class llama_params_fit_exception : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;
};
|
|
||||||
|
|
||||||
static void llama_params_fit_impl(
|
|
||||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
||||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
||||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
||||||
constexpr int64_t MiB = 1024*1024;
|
|
||||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
|
||||||
const llama_model_params default_mparams = llama_model_default_params();
|
|
||||||
|
|
||||||
std::vector<ggml_backend_dev_t> devs;
|
|
||||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
|
||||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
|
||||||
uint32_t hp_nex = 0; // hparams.n_expert
|
|
||||||
|
|
||||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
|
||||||
|
|
||||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
|
||||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
||||||
const size_t nd = devs.size(); // number of devices
|
|
||||||
if (nd == 0) {
|
|
||||||
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
|
||||||
margins.reserve(nd);
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
margins.push_back(margins_s[id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> dev_names;
|
|
||||||
{
|
|
||||||
dev_names.reserve(nd);
|
|
||||||
size_t max_length = 0;
|
|
||||||
for (ggml_backend_dev_t dev : devs) {
|
|
||||||
std::string name = ggml_backend_dev_name(dev);
|
|
||||||
name += " (";
|
|
||||||
name += ggml_backend_dev_description(dev);
|
|
||||||
name += ")";
|
|
||||||
dev_names.push_back(name);
|
|
||||||
max_length = std::max(max_length, name.length());
|
|
||||||
}
|
|
||||||
for (std::string & dn : dev_names) {
|
|
||||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t sum_free = 0;
|
|
||||||
int64_t sum_projected_free = 0;
|
|
||||||
int64_t sum_projected_used = 0;
|
|
||||||
int64_t sum_projected_model = 0;
|
|
||||||
std::vector<int64_t> projected_free_per_device;
|
|
||||||
projected_free_per_device.reserve(nd);
|
|
||||||
|
|
||||||
if (nd > 1) {
|
|
||||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
|
||||||
}
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
const llama_device_memory_data & dmd = dmds_full[id];
|
|
||||||
|
|
||||||
const int64_t projected_used = dmd.mb.total();
|
|
||||||
const int64_t projected_free = dmd.free - projected_used;
|
|
||||||
projected_free_per_device.push_back(projected_free);
|
|
||||||
|
|
||||||
sum_free += dmd.free;
|
|
||||||
sum_projected_used += projected_used;
|
|
||||||
sum_projected_free += projected_free;
|
|
||||||
sum_projected_model += dmd.mb.model;
|
|
||||||
|
|
||||||
if (nd > 1) {
|
|
||||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
|
||||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
|
||||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
|
||||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
|
||||||
if (nd == 1) {
|
|
||||||
if (projected_free_per_device[0] >= margins[0]) {
|
|
||||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
|
||||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
bool changes_needed = false;
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
if (projected_free_per_device[id] < margins[id]) {
|
|
||||||
changes_needed = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!changes_needed) {
|
|
||||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// step 2: try reducing memory use by reducing the context size
|
|
||||||
|
|
||||||
{
|
|
||||||
int64_t global_surplus = sum_projected_free;
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
global_surplus -= margins[id];
|
|
||||||
}
|
|
||||||
if (global_surplus < 0) {
|
|
||||||
if (nd == 1) {
|
|
||||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
|
||||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
|
||||||
} else {
|
|
||||||
LLAMA_LOG_INFO(
|
|
||||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
|
||||||
__func__, -global_surplus/MiB);
|
|
||||||
}
|
|
||||||
if (cparams->n_ctx == 0) {
|
|
||||||
if (hp_nct > n_ctx_min) {
|
|
||||||
int64_t sum_used_target = sum_free;
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
sum_used_target -= margins[id];
|
|
||||||
}
|
|
||||||
if (nd > 1) {
|
|
||||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
|
||||||
// - for dense models only whole layers can be assigned to devices
|
|
||||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
|
||||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
|
||||||
// - use slightly more than the expected average for nd devices to be safe
|
|
||||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
|
||||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t sum_projected_used_min_ctx = 0;
|
|
||||||
cparams->n_ctx = n_ctx_min;
|
|
||||||
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
||||||
for (const auto & dmd : dmds_min_ctx) {
|
|
||||||
sum_projected_used_min_ctx += dmd.mb.total();
|
|
||||||
}
|
|
||||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
|
||||||
// linear interpolation between minimum and maximum context size:
|
|
||||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
|
||||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
|
||||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
|
|
||||||
|
|
||||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
|
||||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
|
||||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
|
||||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
|
||||||
if (nd == 1) {
|
|
||||||
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
|
|
||||||
} else {
|
|
||||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
|
||||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
|
||||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (n_ctx_min == UINT32_MAX) {
|
|
||||||
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
|
||||||
} else {
|
|
||||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
|
||||||
__func__, hp_nct, n_ctx_min);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
|
||||||
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
|
||||||
}
|
|
||||||
if (nd > 1) {
|
|
||||||
if (!tensor_split) {
|
|
||||||
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
|
||||||
}
|
|
||||||
if (mparams->tensor_split) {
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
if (mparams->tensor_split[id] != 0.0f) {
|
|
||||||
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
||||||
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!tensor_buft_overrides) {
|
|
||||||
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
|
||||||
}
|
|
||||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
|
||||||
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
|
||||||
}
|
|
||||||
|
|
||||||
// step 3: iteratively fill the back to front with "dense" layers
|
|
||||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
|
||||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
|
||||||
|
|
||||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
|
||||||
auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
|
|
||||||
constexpr size_t n_strings = 1000;
|
|
||||||
if (il >= n_strings) {
|
|
||||||
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
|
||||||
}
|
|
||||||
switch (lf) {
|
|
||||||
case LAYER_FRACTION_ATTN: {
|
|
||||||
static std::array<std::string, n_strings> patterns;
|
|
||||||
if (patterns[il].empty()) {
|
|
||||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
|
|
||||||
}
|
|
||||||
return patterns[il].c_str();
|
|
||||||
}
|
|
||||||
case LAYER_FRACTION_UP: {
|
|
||||||
static std::array<std::string, n_strings> patterns;
|
|
||||||
if (patterns[il].empty()) {
|
|
||||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
|
|
||||||
}
|
|
||||||
return patterns[il].c_str();
|
|
||||||
}
|
|
||||||
case LAYER_FRACTION_GATE: {
|
|
||||||
static std::array<std::string, n_strings> patterns;
|
|
||||||
if (patterns[il].empty()) {
|
|
||||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
|
||||||
}
|
|
||||||
return patterns[il].c_str();
|
|
||||||
}
|
|
||||||
case LAYER_FRACTION_MOE: {
|
|
||||||
static std::array<std::string, n_strings> patterns;
|
|
||||||
if (patterns[il].empty()) {
|
|
||||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
|
|
||||||
}
|
|
||||||
return patterns[il].c_str();
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ngl_t {
|
|
||||||
uint32_t n_layer = 0; // number of total layers
|
|
||||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
|
||||||
|
|
||||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
|
||||||
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
|
||||||
|
|
||||||
uint32_t n_full() const {
|
|
||||||
assert(n_layer >= n_part);
|
|
||||||
return n_layer - n_part;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
|
||||||
|
|
||||||
// utility function to set n_gpu_layers and tensor_split
|
|
||||||
auto set_ngl_tensor_split_tbo = [&](
|
|
||||||
const std::vector<ngl_t> & ngl_per_device,
|
|
||||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
|
||||||
llama_model_params & mparams) {
|
|
||||||
mparams.n_gpu_layers = 0;
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
|
||||||
if (nd > 1) {
|
|
||||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
|
||||||
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
|
||||||
|
|
||||||
mparams.tensor_split = tensor_split;
|
|
||||||
|
|
||||||
size_t itbo = 0;
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
il0 += ngl_per_device[id].n_full();
|
|
||||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
|
||||||
if (itbo + 1 >= ntbo) {
|
|
||||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
|
||||||
tensor_buft_overrides[itbo].buft = nullptr;
|
|
||||||
itbo++;
|
|
||||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
|
||||||
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
|
||||||
+ std::to_string(ntbo) + " is insufficient for model");
|
|
||||||
}
|
|
||||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
|
||||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
|
||||||
itbo++;
|
|
||||||
}
|
|
||||||
il0 += ngl_per_device[id].n_part;
|
|
||||||
}
|
|
||||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
|
||||||
tensor_buft_overrides[itbo].buft = nullptr;
|
|
||||||
itbo++;
|
|
||||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
|
||||||
};
|
|
||||||
|
|
||||||
// utility function that returns the memory use per device for given numbers of layers per device
|
|
||||||
auto get_memory_for_layers = [&](
|
|
||||||
const char * func_name,
|
|
||||||
const std::vector<ngl_t> & ngl_per_device,
|
|
||||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
|
||||||
llama_model_params mparams_copy = *mparams;
|
|
||||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
|
||||||
|
|
||||||
const dmds_t dmd_nl = llama_get_device_memory_data(
|
|
||||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
||||||
|
|
||||||
LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
const ngl_t & n = ngl_per_device[id];
|
|
||||||
LLAMA_LOG_DEBUG(
|
|
||||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
|
||||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int64_t> ret;
|
|
||||||
ret.reserve(nd);
|
|
||||||
for (const llama_device_memory_data & dmd : dmd_nl) {
|
|
||||||
ret.push_back(dmd.mb.total());
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
};
|
|
||||||
|
|
||||||
int64_t global_surplus_cpu_moe = 0;
|
|
||||||
if (hp_nex > 0) {
|
|
||||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
|
|
||||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
|
||||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
|
||||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
|
||||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
|
||||||
|
|
||||||
LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
|
||||||
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
|
||||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
||||||
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
|
||||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (global_surplus_cpu_moe > 0) {
|
|
||||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
|
||||||
__func__, global_surplus_cpu_moe/MiB);
|
|
||||||
} else {
|
|
||||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
|
||||||
__func__, -global_surplus_cpu_moe/MiB);
|
|
||||||
}
|
|
||||||
|
|
||||||
// reset
|
|
||||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
|
||||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
|
||||||
targets.reserve(nd);
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
targets.push_back(dmds_full[id].free - margins[id]);
|
|
||||||
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
|
||||||
overflow_bufts.reserve(nd);
|
|
||||||
for (size_t id = 0; id < nd; id++) {
|
|
||||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<ngl_t> ngl_per_device(nd);
|
|
||||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
|
||||||
|
|
||||||
// optimize the number of layers per device using the method of false position:
|
|
||||||
// - ngl_per_device has 0 layers for each device, lower bound
|
|
||||||
// - try a "high" configuration where a device is given all unassigned layers
|
|
||||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
|
||||||
// - check memory use of our guess, replace either the low or high bound
|
|
||||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
|
||||||
// - the last device has the output layer, which cannot be a partial layer
|
|
||||||
if (hp_nex == 0) {
|
|
||||||
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
|
|
||||||
} else {
|
|
||||||
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
|
|
||||||
}
|
|
||||||
for (int id = nd - 1; id >= 0; id--) {
|
|
||||||
uint32_t n_unassigned = hp_ngl + 1;
|
|
||||||
for (size_t jd = id + 1; jd < nd; ++jd) {
|
|
||||||
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
|
||||||
n_unassigned -= ngl_per_device[jd].n_layer;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
|
||||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
|
||||||
if (hp_nex > 0) {
|
|
||||||
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
|
||||||
}
|
|
||||||
if (ngl_per_device_high[id].n_layer > 0) {
|
|
||||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
|
||||||
if (mem_high[id] > targets[id]) {
|
|
||||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
|
||||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
|
||||||
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
|
||||||
while (delta > 1) {
|
|
||||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
|
||||||
step_size = std::max(step_size, uint32_t(1));
|
|
||||||
step_size = std::min(step_size, delta - 1);
|
|
||||||
|
|
||||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
||||||
ngl_per_device_test[id].n_layer += step_size;
|
|
||||||
if (hp_nex) {
|
|
||||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
|
||||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
|
||||||
}
|
|
||||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
|
||||||
|
|
||||||
if (mem_test[id] <= targets[id]) {
|
|
||||||
ngl_per_device = ngl_per_device_test;
|
|
||||||
mem = mem_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
|
||||||
} else {
|
|
||||||
ngl_per_device_high = ngl_per_device_test;
|
|
||||||
mem_high = mem_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
|
||||||
}
|
|
||||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
|
||||||
ngl_per_device = ngl_per_device_high;
|
|
||||||
mem = mem_high;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
||||||
LLAMA_LOG_INFO(
|
|
||||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
||||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
|
||||||
}
|
|
||||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
|
||||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// step 4: for a MoE model where all dense tensors fit,
|
|
||||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
|
||||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
|
||||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
|
||||||
|
|
||||||
size_t id_dense_start = nd;
|
|
||||||
for (int id = nd - 1; id >= 0; id--) {
|
|
||||||
if (ngl_per_device[id].n_layer > 0) {
|
|
||||||
id_dense_start = id;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
assert(id_dense_start < nd);
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
|
||||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
|
||||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
|
||||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
|
||||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
|
||||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
|
||||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
|
||||||
ngl_per_device_high[jd].n_part = 0;
|
|
||||||
}
|
|
||||||
size_t id_dense_start_high = nd - 1;
|
|
||||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
|
||||||
|
|
||||||
if (mem_high[id] > targets[id]) {
|
|
||||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
|
||||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
|
||||||
while (delta > 1) {
|
|
||||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
|
||||||
step_size = std::max(step_size, uint32_t(1));
|
|
||||||
step_size = std::min(step_size, delta - 1);
|
|
||||||
|
|
||||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
||||||
size_t id_dense_start_test = id_dense_start;
|
|
||||||
uint32_t n_converted_test = 0;
|
|
||||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
|
||||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
|
||||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
|
||||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
|
||||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
|
||||||
n_converted_test += n_convert_jd;
|
|
||||||
|
|
||||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
|
||||||
|
|
||||||
if (mem_test[id] <= targets[id]) {
|
|
||||||
ngl_per_device = ngl_per_device_test;
|
|
||||||
mem = mem_test;
|
|
||||||
id_dense_start = id_dense_start_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
|
||||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
||||||
} else {
|
|
||||||
ngl_per_device_high = ngl_per_device_test;
|
|
||||||
mem_high = mem_test;
|
|
||||||
id_dense_start_high = id_dense_start_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
|
||||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
|
||||||
}
|
|
||||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
|
||||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
ngl_per_device = ngl_per_device_high;
|
|
||||||
mem = mem_high;
|
|
||||||
id_dense_start = id_dense_start_high;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
|
||||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
||||||
}
|
|
||||||
|
|
||||||
// try to fit at least part of one more layer
|
|
||||||
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
|
||||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
||||||
size_t id_dense_start_test = id_dense_start;
|
|
||||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
|
||||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
|
||||||
ngl_per_device_test[id].n_layer++;
|
|
||||||
ngl_per_device_test[id].n_part++;
|
|
||||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
|
||||||
id_dense_start_test++;
|
|
||||||
}
|
|
||||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
|
||||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
|
||||||
if (id < nd - 1) {
|
|
||||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
|
||||||
}
|
|
||||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
|
||||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
||||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
||||||
ngl_per_device = ngl_per_device_test;
|
|
||||||
overflow_bufts = overflow_bufts_test;
|
|
||||||
mem = mem_test;
|
|
||||||
id_dense_start = id_dense_start_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
|
||||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
||||||
|
|
||||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
|
||||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
|
||||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
||||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
||||||
ngl_per_device = ngl_per_device_test;
|
|
||||||
overflow_bufts = overflow_bufts_test;
|
|
||||||
mem = mem_test;
|
|
||||||
id_dense_start = id_dense_start_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
|
||||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
|
||||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
|
||||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
||||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
||||||
ngl_per_device = ngl_per_device_test;
|
|
||||||
overflow_bufts = overflow_bufts_test;
|
|
||||||
mem = mem_test;
|
|
||||||
id_dense_start = id_dense_start_test;
|
|
||||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
|
||||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
||||||
LLAMA_LOG_INFO(
|
|
||||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
||||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
|
||||||
}
|
|
||||||
|
|
||||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
|
||||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
|
||||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
||||||
LLAMA_LOG_INFO(
|
|
||||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
||||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
|
||||||
}
|
|
||||||
|
|
||||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
|
||||||
}
|
|
||||||
|
|
||||||
enum llama_params_fit_status llama_params_fit(
|
|
||||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
||||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
||||||
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
||||||
const int64_t t0_us = llama_time_us();
|
|
||||||
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
|
||||||
try {
|
|
||||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
|
||||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
|
||||||
} catch (const llama_params_fit_exception & e) {
|
|
||||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
|
||||||
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
|
|
||||||
} catch (const std::runtime_error & e) {
|
|
||||||
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
|
||||||
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
|
|
||||||
}
|
|
||||||
const int64_t t1_us = llama_time_us();
|
|
||||||
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
||||||
struct llama_sampler_chain_params result = {
|
struct llama_sampler_chain_params result = {
|
||||||
/*.no_perf =*/ true,
|
/*.no_perf =*/ true,
|
||||||
|
|
@ -828,7 +113,7 @@ int64_t llama_time_us(void) {
|
||||||
|
|
||||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
|
||||||
// loading time will be recalculated after the first eval, so
|
// loading time will be recalculated after the first eval, so
|
||||||
// we take page faults deferred by mmap() into consideration
|
// we take page faults deferred by mmap() into consideration
|
||||||
model.t_load_us = 0;
|
model.t_load_us = 0;
|
||||||
|
|
@ -837,7 +122,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
|
||||||
model.t_start_us = tm.t_start_us;
|
model.t_start_us = tm.t_start_us;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
|
||||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||||
|
|
||||||
ml.print_info();
|
ml.print_info();
|
||||||
|
|
@ -889,8 +174,24 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
void * set_tensor_data_ud,
|
void * set_tensor_data_ud,
|
||||||
const std::string & path_model,
|
const std::string & path_model,
|
||||||
std::vector<std::string> & splits,
|
std::vector<std::string> & splits,
|
||||||
|
FILE * file,
|
||||||
struct llama_model_params params) {
|
struct llama_model_params params) {
|
||||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
{
|
||||||
|
int n_sources_defined = 0;
|
||||||
|
if (metadata != nullptr) {
|
||||||
|
n_sources_defined++;
|
||||||
|
}
|
||||||
|
if (!path_model.empty()) {
|
||||||
|
n_sources_defined++;
|
||||||
|
}
|
||||||
|
if (file != nullptr) {
|
||||||
|
n_sources_defined++;
|
||||||
|
}
|
||||||
|
if (n_sources_defined != 1) {
|
||||||
|
LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||||
|
|
@ -919,58 +220,111 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
|
|
||||||
// create list of devices to use with this model
|
// create list of devices to use with this model
|
||||||
if (params.devices) {
|
if (params.devices) {
|
||||||
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||||
model->devices.push_back(*dev);
|
size_t n_devs = 0;
|
||||||
|
while (params.devices[n_devs]) {
|
||||||
|
n_devs++;
|
||||||
|
}
|
||||||
|
if (n_devs == 0) {
|
||||||
|
LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
|
||||||
|
for (size_t i = 0; i < n_devs; ++i) {
|
||||||
|
LLAMA_LOG_INFO("%s: - device %zu: %s\n", __func__, i, ggml_backend_dev_name(params.devices[i]));
|
||||||
|
}
|
||||||
|
model->get_split_state_ud.n_devices = n_devs;
|
||||||
|
model->get_split_state_ud.model = model;
|
||||||
|
model->devices.push_back({
|
||||||
|
true, ggml_backend_meta_device(
|
||||||
|
params.devices, n_devs, llama_meta_device_get_split_state, &model->get_split_state_ud)
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
||||||
|
model->devices.push_back({false, *dev});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// default device selection
|
// default device selection
|
||||||
|
|
||||||
// build list of available devices
|
// build list of available devices
|
||||||
std::vector<ggml_backend_dev_t> gpus;
|
std::vector<llama_device> gpus;
|
||||||
std::vector<ggml_backend_dev_t> igpus;
|
std::vector<llama_device> igpus;
|
||||||
std::vector<ggml_backend_dev_t> rpc_servers;
|
std::vector<llama_device> rpc_servers;
|
||||||
|
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
std::vector<ggml_backend_dev_t> devs;
|
||||||
switch (ggml_backend_dev_type(dev)) {
|
devs.reserve(ggml_backend_dev_count());
|
||||||
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
// skip CPU backends since they are handled separately
|
if (ggml_backend_dev_buffer_type(dev) == ggml_backend_cpu_buffer_type()) {
|
||||||
break;
|
LLAMA_LOG_INFO("%s: skipping %s (%s) for tensor parallelism\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
|
||||||
|
continue;
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU: {
|
|
||||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
|
||||||
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
|
||||||
rpc_servers.push_back(dev);
|
|
||||||
} else {
|
|
||||||
// check if there is already a GPU with the same device id
|
|
||||||
ggml_backend_dev_props props;
|
|
||||||
ggml_backend_dev_get_props(dev, &props);
|
|
||||||
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
|
|
||||||
ggml_backend_dev_props d_props;
|
|
||||||
ggml_backend_dev_get_props(d, &d_props);
|
|
||||||
if (props.device_id && d_props.device_id) {
|
|
||||||
return strcmp(props.device_id, d_props.device_id) == 0;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
});
|
|
||||||
|
|
||||||
if (it != gpus.end()) {
|
|
||||||
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
|
|
||||||
__func__,
|
|
||||||
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
|
||||||
props.device_id ? props.device_id : "unknown id",
|
|
||||||
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
|
|
||||||
} else {
|
|
||||||
gpus.push_back(dev);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
devs.push_back(dev);
|
||||||
|
}
|
||||||
|
if (devs.empty()) {
|
||||||
|
LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
|
||||||
igpus.push_back(dev);
|
for (size_t i = 0; i < devs.size(); ++i) {
|
||||||
break;
|
LLAMA_LOG_INFO("%s: - device %zu: %s (%s)\n", __func__, i, ggml_backend_dev_name(devs[i]), ggml_backend_dev_description(devs[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(!devs.empty());
|
||||||
|
model->get_split_state_ud.n_devices = devs.size();
|
||||||
|
model->get_split_state_ud.model = model;
|
||||||
|
gpus.push_back({
|
||||||
|
true, ggml_backend_meta_device(
|
||||||
|
devs.data(), devs.size(), llama_meta_device_get_split_state, &model->get_split_state_ud)
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
|
switch (ggml_backend_dev_type(dev)) {
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||||
|
// skip CPU backends since they are handled separately
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_GPU: {
|
||||||
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||||
|
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
||||||
|
rpc_servers.push_back({false, dev});
|
||||||
|
} else {
|
||||||
|
// check if there is already a GPU with the same device id
|
||||||
|
ggml_backend_dev_props props;
|
||||||
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
|
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](const llama_device & d) {
|
||||||
|
ggml_backend_dev_props d_props;
|
||||||
|
ggml_backend_dev_get_props(d.dev, &d_props);
|
||||||
|
if (props.device_id && d_props.device_id) {
|
||||||
|
return strcmp(props.device_id, d_props.device_id) == 0;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (it != gpus.end()) {
|
||||||
|
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
|
||||||
|
__func__,
|
||||||
|
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
||||||
|
props.device_id ? props.device_id : "unknown id",
|
||||||
|
ggml_backend_dev_name(it->dev), ggml_backend_dev_description(it->dev));
|
||||||
|
} else {
|
||||||
|
gpus.push_back({false, dev});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
||||||
|
igpus.push_back({false, dev});
|
||||||
|
break;
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_META:
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -996,22 +350,22 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
llama_model_free(model);
|
llama_model_free(model);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
|
llama_device main_gpu = model->devices[params.main_gpu];
|
||||||
model->devices.clear();
|
model->devices.clear();
|
||||||
model->devices.push_back(main_gpu);
|
model->devices.push_back(main_gpu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto * dev : model->devices) {
|
for (const auto & dev : model->devices) {
|
||||||
ggml_backend_dev_props props;
|
ggml_backend_dev_props props;
|
||||||
ggml_backend_dev_get_props(dev, &props);
|
ggml_backend_dev_get_props(dev.dev, &props);
|
||||||
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
||||||
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
ggml_backend_dev_name(dev.dev), ggml_backend_dev_description(dev.dev),
|
||||||
props.device_id ? props.device_id : "unknown id",
|
props.device_id ? props.device_id : "unknown id",
|
||||||
props.memory_free/1024/1024);
|
props.memory_free/1024/1024);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
|
||||||
GGML_ASSERT(status <= 0);
|
GGML_ASSERT(status <= 0);
|
||||||
if (status < 0) {
|
if (status < 0) {
|
||||||
if (status == -1) {
|
if (status == -1) {
|
||||||
|
|
@ -1037,7 +391,7 @@ struct llama_model * llama_model_init_from_user(
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
params.use_extra_bufts = false;
|
params.use_extra_bufts = false;
|
||||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params);
|
||||||
}
|
}
|
||||||
// deprecated
|
// deprecated
|
||||||
struct llama_model * llama_load_model_from_file(
|
struct llama_model * llama_load_model_from_file(
|
||||||
|
|
@ -1050,7 +404,7 @@ struct llama_model * llama_model_load_from_file(
|
||||||
const char * path_model,
|
const char * path_model,
|
||||||
struct llama_model_params params) {
|
struct llama_model_params params) {
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model * llama_model_load_from_splits(
|
struct llama_model * llama_model_load_from_splits(
|
||||||
|
|
@ -1066,7 +420,17 @@ struct llama_model * llama_model_load_from_splits(
|
||||||
for (size_t i = 0; i < n_paths; ++i) {
|
for (size_t i = 0; i < n_paths; ++i) {
|
||||||
splits.push_back(paths[i]);
|
splits.push_back(paths[i]);
|
||||||
}
|
}
|
||||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
|
||||||
|
if (!file) {
|
||||||
|
LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
std::string path_model;
|
||||||
|
std::vector<std::string> splits = {};
|
||||||
|
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||||
|
|
|
||||||
|
|
@ -154,6 +154,7 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
|
||||||
|
|
||||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||||
};
|
};
|
||||||
|
|
@ -191,9 +192,10 @@ extern "C" {
|
||||||
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
|
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||||
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
||||||
|
LLAMA_SPLIT_MODE_TENSOR = 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||||
|
|
@ -380,22 +382,33 @@ extern "C" {
|
||||||
size_t n_samplers;
|
size_t n_samplers;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct llama_model_tensor_override {
|
||||||
|
const char * pattern;
|
||||||
|
enum ggml_type type;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_model_imatrix_data {
|
||||||
|
const char * name;
|
||||||
|
const float * data;
|
||||||
|
size_t size;
|
||||||
|
};
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
typedef struct llama_model_quantize_params {
|
typedef struct llama_model_quantize_params {
|
||||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
enum ggml_type output_tensor_type; // output tensor type
|
enum ggml_type output_tensor_type; // output tensor type
|
||||||
enum ggml_type token_embedding_type; // token embeddings tensor type
|
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
bool pure; // quantize all tensors to the default type
|
bool pure; // quantize all tensors to the default type
|
||||||
bool keep_split; // quantize to the same number of shards
|
bool keep_split; // quantize to the same number of shards
|
||||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||||
void * imatrix; // pointer to importance matrix data
|
const struct llama_model_imatrix_data * imatrix; // pointer to importance matrix data
|
||||||
void * kv_overrides; // pointer to vector containing overrides
|
const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides
|
||||||
void * tensor_types; // pointer to vector containing tensor types
|
const struct llama_model_tensor_override * tt_overrides; // pointer to tensor overrides
|
||||||
void * prune_layers; // pointer to vector containing layer indices to prune
|
const int32_t * prune_layers; // pointer to layer indices to prune
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
typedef struct llama_logit_bias {
|
typedef struct llama_logit_bias {
|
||||||
|
|
@ -465,6 +478,11 @@ extern "C" {
|
||||||
const char * path_model,
|
const char * path_model,
|
||||||
struct llama_model_params params);
|
struct llama_model_params params);
|
||||||
|
|
||||||
|
// Load a model from an open FILE pointer
|
||||||
|
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
|
||||||
|
FILE * file,
|
||||||
|
struct llama_model_params params);
|
||||||
|
|
||||||
// Load a model from multiple splits (support custom naming scheme)
|
// Load a model from multiple splits (support custom naming scheme)
|
||||||
// The paths must be in the correct order
|
// The paths must be in the correct order
|
||||||
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
||||||
|
|
@ -493,27 +511,6 @@ extern "C" {
|
||||||
// Frees all allocated memory
|
// Frees all allocated memory
|
||||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||||
|
|
||||||
enum llama_params_fit_status {
|
|
||||||
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
|
||||||
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
|
||||||
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
|
||||||
};
|
|
||||||
|
|
||||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
|
||||||
// - returns true if the parameters could be successfully modified to fit device memory
|
|
||||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
|
||||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
|
||||||
// with the exception of the context size which is modified if and only if equal to 0
|
|
||||||
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
|
||||||
const char * path_model,
|
|
||||||
struct llama_model_params * mparams,
|
|
||||||
struct llama_context_params * cparams,
|
|
||||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
|
||||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
|
||||||
size_t * margins, // margins of memory to leave per device in bytes
|
|
||||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
|
||||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
|
||||||
|
|
||||||
LLAMA_API int64_t llama_time_us(void);
|
LLAMA_API int64_t llama_time_us(void);
|
||||||
|
|
||||||
LLAMA_API size_t llama_max_devices(void);
|
LLAMA_API size_t llama_max_devices(void);
|
||||||
|
|
@ -636,7 +633,6 @@ extern "C" {
|
||||||
|
|
||||||
// Load a LoRA adapter from file
|
// Load a LoRA adapter from file
|
||||||
// The adapter is valid as long as the associated model is not freed
|
// The adapter is valid as long as the associated model is not freed
|
||||||
// All adapters must be loaded before context creation
|
|
||||||
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
|
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
const char * path_lora);
|
const char * path_lora);
|
||||||
|
|
@ -660,9 +656,8 @@ extern "C" {
|
||||||
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
||||||
|
|
||||||
// Manually free a LoRA adapter
|
// Manually free a LoRA adapter
|
||||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
// NOTE: loaded adapters that are not manually freed will be freed when the associated model is deleted
|
||||||
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||||
"adapters are now freed together with the associated model");
|
|
||||||
|
|
||||||
// Get the invocation tokens if the current lora is an alora
|
// Get the invocation tokens if the current lora is an alora
|
||||||
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
||||||
|
|
@ -1530,9 +1525,6 @@ extern "C" {
|
||||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||||
|
|
||||||
// print a breakdown of per-device memory use via LLAMA_LOG:
|
|
||||||
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// training
|
// training
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -41,22 +41,13 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||||
{
|
{
|
||||||
ggml_tensor * attn_inp = cur; // save input for gate computation
|
ggml_tensor * attn_inp = cur; // save input for gate computation
|
||||||
|
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
// compute gate from input
|
// compute gate from input
|
||||||
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
|
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
|
||||||
cb(gate, "attn_gate_proj", il);
|
cb(gate, "attn_gate_proj", il);
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
// Q/K normalization
|
// Q/K normalization
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -77,10 +68,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||||
cb(Kcur, "Kcur_rope", il);
|
cb(Kcur, "Kcur_rope", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
NULL, NULL, // wo will be applied after gating
|
NULL, NULL, NULL, // wo will be applied after gating
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
|
|
||||||
|
|
@ -91,7 +80,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||||
cb(cur, "attn_gated", il);
|
cb(cur, "attn_gated", il);
|
||||||
|
|
||||||
// now apply output projection
|
// now apply output projection
|
||||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||||
cb(cur, "attn_o_proj", il);
|
cb(cur, "attn_o_proj", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
||||||
|
|
@ -32,25 +30,15 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Kcur, "Kcur_normed", il);
|
cb(Kcur, "Kcur_normed", il);
|
||||||
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
|
@ -62,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
|
||||||
cb(Vcur, "Vcur_pos", il);
|
cb(Vcur, "Vcur_pos", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
||||||
|
|
@ -36,30 +35,8 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, rope_factors,
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
|
|
@ -78,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -30,18 +30,8 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -60,7 +50,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
||||||
|
|
@ -29,18 +28,8 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
switch (model.type) {
|
switch (model.type) {
|
||||||
case LLM_TYPE_7B:
|
case LLM_TYPE_7B:
|
||||||
|
|
@ -67,7 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,30 +28,8 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head_k, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, rope_factors,
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
|
|
@ -70,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -29,15 +28,8 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
||||||
|
|
||||||
// self_attention
|
// self_attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 0 * sizeof(float) * (n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -56,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -28,8 +27,8 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// embed layer norm
|
// embed layer norm
|
||||||
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, 0);
|
||||||
cb(inpL, "inp_norm", -1);
|
cb(inpL, "inp_norm", 0);
|
||||||
|
|
||||||
auto * inp_attn = build_attn_inp_no_cache();
|
auto * inp_attn = build_attn_inp_no_cache();
|
||||||
|
|
||||||
|
|
@ -39,35 +38,8 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
||||||
ggml_tensor * cur = inpL;
|
ggml_tensor * cur = inpL;
|
||||||
|
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur;
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur;
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur;
|
|
||||||
|
|
||||||
// self-attention
|
|
||||||
if (model.layers[il].wqkv) {
|
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
||||||
cb(cur, "wqkv", il);
|
|
||||||
|
|
||||||
if (model.layers[il].bqkv) {
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
|
|
||||||
0 * sizeof(float) * (n_embd));
|
|
||||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
||||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
|
||||||
} else {
|
|
||||||
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
||||||
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
||||||
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (model.layers[il].attn_q_norm) {
|
if (model.layers[il].attn_q_norm) {
|
||||||
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
|
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
|
||||||
|
|
@ -100,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,33 +28,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
// B1.K
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
// B1.V
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -73,7 +48,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
NULL, NULL,
|
NULL, NULL, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
|
|
||||||
cur = build_norm(cur,
|
cur = build_norm(cur,
|
||||||
|
|
@ -82,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||||
cb(cur, "attn_sub_norm", il);
|
cb(cur, "attn_sub_norm", il);
|
||||||
|
|
||||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||||
if (model.layers[il].bo) {
|
if (model.layers[il].wo_b) {
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
|
||||||
}
|
}
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
@ -121,6 +96,9 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
cb(cur, "l_out", il);
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -16,8 +15,8 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
|
||||||
inpL = build_norm(inpL,
|
inpL = build_norm(inpL,
|
||||||
model.tok_norm,
|
model.tok_norm,
|
||||||
model.tok_norm_b,
|
model.tok_norm_b,
|
||||||
LLM_NORM, -1);
|
LLM_NORM, 0);
|
||||||
cb(inpL, "inp_norm", -1);
|
cb(inpL, "inp_norm", 0);
|
||||||
|
|
||||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
|
|
||||||
|
|
@ -30,22 +29,11 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -36,22 +36,10 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
if (model.layers[il].attn_q_norm) {
|
if (model.layers[il].attn_q_norm) {
|
||||||
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
|
||||||
ggml_element_size(Qcur) * n_embd_head,
|
|
||||||
ggml_element_size(Qcur) * n_embd_head * n_head,
|
|
||||||
0);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur,
|
Qcur = build_norm(Qcur,
|
||||||
model.layers[il].attn_q_norm,
|
model.layers[il].attn_q_norm,
|
||||||
model.layers[il].attn_q_norm_b,
|
model.layers[il].attn_q_norm_b,
|
||||||
|
|
@ -60,12 +48,6 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
|
||||||
}
|
}
|
||||||
|
|
||||||
if (model.layers[il].attn_k_norm) {
|
if (model.layers[il].attn_k_norm) {
|
||||||
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
|
||||||
ggml_element_size(Kcur) * n_embd_head,
|
|
||||||
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
|
||||||
0);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
Kcur = build_norm(Kcur,
|
Kcur = build_norm(Kcur,
|
||||||
model.layers[il].attn_k_norm,
|
model.layers[il].attn_k_norm,
|
||||||
model.layers[il].attn_k_norm_b,
|
model.layers[il].attn_k_norm_b,
|
||||||
|
|
@ -73,10 +55,6 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
|
@ -94,7 +72,7 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, nullptr,
|
model.layers[il].wo, nullptr, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
|
|
||||||
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -30,37 +29,8 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = nullptr;
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur = nullptr;
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur = nullptr;
|
|
||||||
|
|
||||||
if (model.layers[il].wqkv == nullptr) {
|
|
||||||
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
}
|
|
||||||
Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
}
|
|
||||||
Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
} else {
|
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
||||||
cb(cur, "wqkv", il);
|
|
||||||
if (model.layers[il].bqkv) {
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
}
|
|
||||||
|
|
||||||
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
|
|
@ -80,7 +50,7 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -111,8 +81,13 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
cb(inpL, "l_out", il);
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = build_norm(inpL,
|
cur = build_norm(inpL,
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
GGML_ASSERT(n_embd_head == n_rot);
|
GGML_ASSERT(n_embd_head == n_rot);
|
||||||
|
|
@ -28,15 +27,8 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -55,7 +47,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,18 +28,20 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
// get either the text or image weight tensors
|
// get either the text or image weight tensors
|
||||||
ggml_tensor *wqkv, *wo;
|
ggml_tensor *wqkv, *wo, *wo_s;
|
||||||
ggml_tensor *ffn_gate, *ffn_down, *ffn_up;
|
ggml_tensor *ffn_gate, *ffn_down, *ffn_up;
|
||||||
|
|
||||||
if (is_text) {
|
if (is_text) {
|
||||||
wqkv = model.layers[il].wqkv;
|
wqkv = model.layers[il].wqkv;
|
||||||
wo = model.layers[il].wo;
|
wo = model.layers[il].wo;
|
||||||
|
wo_s = model.layers[il].wo_s;
|
||||||
ffn_gate = model.layers[il].ffn_gate;
|
ffn_gate = model.layers[il].ffn_gate;
|
||||||
ffn_down = model.layers[il].ffn_down;
|
ffn_down = model.layers[il].ffn_down;
|
||||||
ffn_up = model.layers[il].ffn_up;
|
ffn_up = model.layers[il].ffn_up;
|
||||||
} else {
|
} else {
|
||||||
wqkv = model.layers[il].visexp_attn_wqkv;
|
wqkv = model.layers[il].visexp_attn_wqkv;
|
||||||
wo = model.layers[il].visexp_attn_wo;
|
wo = model.layers[il].visexp_attn_wo;
|
||||||
|
wo_s = nullptr;
|
||||||
ffn_gate = model.layers[il].visexp_ffn_gate;
|
ffn_gate = model.layers[il].visexp_ffn_gate;
|
||||||
ffn_down = model.layers[il].visexp_ffn_down;
|
ffn_down = model.layers[il].visexp_ffn_down;
|
||||||
ffn_up = model.layers[il].visexp_ffn_up;
|
ffn_up = model.layers[il].visexp_ffn_up;
|
||||||
|
|
@ -64,7 +66,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
||||||
Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
|
Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
wo, nullptr,
|
wo, nullptr, wo_s,
|
||||||
Qcur, Kcur, Vcur,
|
Qcur, Kcur, Vcur,
|
||||||
nullptr, nullptr, nullptr,
|
nullptr, nullptr, nullptr,
|
||||||
kq_scale, il);
|
kq_scale, il);
|
||||||
|
|
@ -86,6 +88,10 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -36,30 +36,8 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
if (is_swa) {
|
if (is_swa) {
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
|
|
@ -80,7 +58,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -32,27 +32,8 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
if (model.layers[il].attn_q_norm) {
|
if (model.layers[il].attn_q_norm) {
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM, il);
|
||||||
|
|
@ -73,7 +54,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
GGML_ASSERT(n_embd_head == n_rot);
|
GGML_ASSERT(n_embd_head == n_rot);
|
||||||
|
|
@ -30,19 +29,8 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = nullptr;
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur = nullptr;
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur = nullptr;
|
|
||||||
|
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
||||||
cb(cur, "wqkv", il);
|
|
||||||
|
|
||||||
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
||||||
cb(cur, "wqkv_clamped", il);
|
|
||||||
|
|
||||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -61,7 +49,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
||||||
|
|
@ -47,27 +45,8 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -80,7 +59,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -35,27 +35,8 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -68,7 +49,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,9 @@
|
||||||
|
|
||||||
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
|
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||||
|
bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR;
|
||||||
|
|
||||||
const bool is_mla = hparams.is_mla();
|
const bool is_mla = hparams.is_mla();
|
||||||
|
|
||||||
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
||||||
|
|
@ -54,7 +57,38 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||||
cb(cur, "attn_norm", il);
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
// self_attention
|
// self_attention
|
||||||
{
|
if (is_ocr) {
|
||||||
|
const int n_embed_head = hparams.n_embd / hparams.n_head();
|
||||||
|
const int ocr_rope_type = GGML_ROPE_TYPE_NEOX;
|
||||||
|
GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v);
|
||||||
|
|
||||||
|
ggml_tensor * Qcur = NULL;
|
||||||
|
ggml_tensor * Kcur = NULL;
|
||||||
|
ggml_tensor * Vcur = NULL;
|
||||||
|
|
||||||
|
Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||||
|
Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||||
|
Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||||
|
cb(Qcur, "q", il);
|
||||||
|
cb(Kcur, "k", il);
|
||||||
|
cb(Vcur, "v", il);
|
||||||
|
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embed_head, n_head, n_tokens);
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embed_head, n_head, n_tokens);
|
||||||
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens);
|
||||||
|
|
||||||
|
GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4);
|
||||||
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
|
||||||
|
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
|
||||||
|
cb(Qcur, "q_pe", il);
|
||||||
|
cb(Kcur, "k_pe", il);
|
||||||
|
|
||||||
|
cur = build_attn(inp_attn_kv,
|
||||||
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
|
cb(cur, "attn_out", il);
|
||||||
|
}
|
||||||
|
else {
|
||||||
ggml_tensor * q = NULL;
|
ggml_tensor * q = NULL;
|
||||||
|
|
||||||
const bool is_lite = model.layers[il].wq;
|
const bool is_lite = model.layers[il].wq;
|
||||||
|
|
@ -148,7 +182,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||||
|
|
||||||
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
||||||
cur = build_attn(inp_attn_k,
|
cur = build_attn(inp_attn_k,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
|
||||||
} else {
|
} else {
|
||||||
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
||||||
|
|
@ -185,7 +219,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||||
|
|
||||||
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
||||||
cur = build_attn(inp_attn_kv,
|
cur = build_attn(inp_attn_kv,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,18 +29,8 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
|
||||||
// self_attention
|
// self_attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -59,7 +49,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
|
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
//copied from qwen2
|
//copied from qwen2
|
||||||
|
|
@ -31,22 +29,8 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -59,7 +43,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -30,27 +30,8 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -63,7 +44,7 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,27 +29,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
|
||||||
}
|
}
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -62,7 +43,7 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
|
|
|
||||||
|
|
@ -24,17 +24,8 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
|
||||||
LLM_NORM_RMS, il);
|
LLM_NORM_RMS, il);
|
||||||
|
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur;
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur;
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur;
|
|
||||||
|
|
||||||
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -53,7 +44,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, nullptr,
|
model.layers[il].wo, nullptr, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
@ -82,6 +73,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
cur = inpL;
|
cur = inpL;
|
||||||
|
|
|
||||||
|
|
@ -35,18 +35,8 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -65,7 +55,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn_iswa,
|
cur = build_attn(inp_attn_iswa,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
|
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
@ -34,27 +32,8 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -67,7 +46,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
template <bool iswa>
|
template <bool iswa>
|
||||||
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
|
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
|
|
@ -39,18 +38,8 @@ llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_
|
||||||
{
|
{
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -69,7 +58,7 @@ llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -27,19 +27,8 @@ llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_gr
|
||||||
cb(cur, "attn_norm", il);
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -52,7 +41,7 @@ llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur-post-rope", il);
|
cb(Vcur, "Vcur-post-rope", il);
|
||||||
|
|
||||||
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(attn_out, "attn_out", il);
|
cb(attn_out, "attn_out", il);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
GGML_ASSERT(n_embd_head == n_rot);
|
GGML_ASSERT(n_embd_head == n_rot);
|
||||||
|
|
@ -42,12 +40,8 @@ llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_pa
|
||||||
cur = attn_norm;
|
cur = attn_norm;
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
|
|
@ -67,7 +61,7 @@ llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_pa
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
|
||||||
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
||||||
cb(inpL, "inp_scaled", -1);
|
cb(inpL, "inp_scaled", -1);
|
||||||
|
|
||||||
|
|
@ -31,18 +31,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -65,7 +55,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
|
||||||
|
|
||||||
cur =
|
cur =
|
||||||
build_attn(inp_attn,
|
build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
|
|
||||||
|
|
@ -29,18 +28,8 @@ llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_para
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -60,7 +49,7 @@ llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_para
|
||||||
cb(Qcur, "Qcur_scaled", il);
|
cb(Qcur, "Qcur_scaled", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -31,18 +31,8 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -61,7 +51,7 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
|
||||||
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
|
||||||
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
||||||
cb(inpL, "inp_scaled", -1);
|
cb(inpL, "inp_scaled", -1);
|
||||||
|
|
||||||
|
|
@ -47,18 +47,8 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -84,7 +74,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
|
||||||
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,12 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
||||||
|
static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) {
|
||||||
|
GGML_ASSERT(idx < (int) x->ne[2]);
|
||||||
|
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
|
||||||
|
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
||||||
|
}
|
||||||
|
|
||||||
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
|
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params),
|
llm_graph_context(params),
|
||||||
model(model),
|
model(model),
|
||||||
|
|
@ -12,7 +19,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
|
||||||
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
||||||
cb(inpL, "inp_scaled", -1);
|
cb(inpL, "inp_scaled", -1);
|
||||||
|
|
||||||
|
|
@ -22,8 +29,11 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
// TODO: is causal == true correct? might need some changes
|
// TODO: is causal == true correct? might need some changes
|
||||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||||
|
|
||||||
// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
|
ggml_tensor * inp_per_layer = build_inp_per_layer();
|
||||||
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
|
ggml_build_forward_expand(gf, inp_per_layer);
|
||||||
|
|
||||||
|
// inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
|
||||||
|
inp_per_layer = project_per_layer_inputs(inpL, inp_per_layer);
|
||||||
|
|
||||||
// inpL now has only 1 altup, project it to the rest of the altups
|
// inpL now has only 1 altup, project it to the rest of the altups
|
||||||
// these "added" altups will be concat to the last dim of inpL
|
// these "added" altups will be concat to the last dim of inpL
|
||||||
|
|
@ -37,8 +47,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
|
inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
|
||||||
cb(inpL, "inp_stacked", -1);
|
cb(inpL, "inp_stacked", -1);
|
||||||
}
|
}
|
||||||
// inpL now has shape: [n_embd, n_tokens, n_altup]
|
// inpL now has shape: [n_embd, n_tokens, n_altup]
|
||||||
// inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
// this block is made to be closely resemble Gemma3p5DecoderLayer on python code
|
// this block is made to be closely resemble Gemma3p5DecoderLayer on python code
|
||||||
|
|
@ -49,8 +58,8 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
|
ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
|
||||||
|
|
||||||
// predicted value will go through self-attention and laurel
|
// predicted value will go through self-attention and laurel
|
||||||
ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
|
ggml_tensor * active_prediction = ggml_view_2d_slice(ctx0, predictions, i_altup_act); // [n_embd, n_tokens]
|
||||||
cur = active_prediction;
|
cur = active_prediction;
|
||||||
cb(cur, "active_prediction", il);
|
cb(cur, "active_prediction", il);
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
|
|
@ -62,19 +71,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
if (hparams.has_kv(il)) {
|
if (hparams.has_kv(il)) {
|
||||||
// compute Q and K and RoPE them
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -94,7 +91,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
cb(Kcur, "Kcur_pos", il);
|
cb(Kcur, "Kcur_pos", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn, model.layers[il].wo,
|
cur = build_attn(inp_attn, model.layers[il].wo,
|
||||||
NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
|
NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
|
||||||
hparams.f_attention_scale, il);
|
hparams.f_attention_scale, il);
|
||||||
} else {
|
} else {
|
||||||
// reuse KV cache of earlier layers
|
// reuse KV cache of earlier layers
|
||||||
|
|
@ -110,7 +107,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
cb(Qcur, "Qcur_pos", il);
|
cb(Qcur, "Qcur_pos", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
||||||
}
|
}
|
||||||
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -151,12 +148,13 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
|
|
||||||
ggml_tensor * first_prediction; // [n_embd, n_tokens]
|
ggml_tensor * first_prediction; // [n_embd, n_tokens]
|
||||||
{
|
{
|
||||||
first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
|
first_prediction = ggml_view_2d_slice(ctx0, corrected, i_altup_act); // [n_embd, n_tokens]
|
||||||
first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
|
first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
|
||||||
first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
|
first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
|
||||||
first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
|
first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
|
||||||
cb(first_prediction, "first_prediction_gated", il);
|
cb(first_prediction, "first_prediction_gated", il);
|
||||||
ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
|
|
||||||
|
ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_altup, n_tokens]
|
||||||
first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
|
first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
|
||||||
cb(first_prediction, "first_prediction_scaled", il);
|
cb(first_prediction, "first_prediction_scaled", il);
|
||||||
|
|
||||||
|
|
@ -167,7 +165,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
}
|
}
|
||||||
// equivalent to python code: corrected_predictions[1:] += first_prediction
|
// equivalent to python code: corrected_predictions[1:] += first_prediction
|
||||||
{
|
{
|
||||||
ggml_tensor * slice_first = view_2d_slice(corrected, 0);
|
ggml_tensor * slice_first = ggml_view_2d_slice(ctx0, corrected, 0);
|
||||||
ggml_tensor * slice_rest = ggml_view_3d(
|
ggml_tensor * slice_rest = ggml_view_3d(
|
||||||
ctx0, corrected, n_embd, n_tokens, n_altup - 1, ggml_row_size(corrected->type, n_embd),
|
ctx0, corrected, n_embd, n_tokens, n_altup - 1, ggml_row_size(corrected->type, n_embd),
|
||||||
ggml_row_size(corrected->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(corrected));
|
ggml_row_size(corrected->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(corrected));
|
||||||
|
|
@ -185,7 +183,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
|
|
||||||
// cur now has multiple altup(s), we want to merge them back to 1 altup
|
// cur now has multiple altup(s), we want to merge them back to 1 altup
|
||||||
{
|
{
|
||||||
ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
|
ggml_tensor * target_magnitude = calc_magnitude(ggml_view_2d_slice(ctx0, cur, i_altup_act)); // [n_embd, n_tokens]
|
||||||
// do a view to skip the first slice (active altup)
|
// do a view to skip the first slice (active altup)
|
||||||
ggml_tensor * alt_slice =
|
ggml_tensor * alt_slice =
|
||||||
ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, ggml_row_size(cur->type, n_embd),
|
ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, ggml_row_size(cur->type, n_embd),
|
||||||
|
|
@ -197,9 +195,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
||||||
cb(altup_unembd, "altup_unembd", -1);
|
cb(altup_unembd, "altup_unembd", -1);
|
||||||
|
|
||||||
// equivalent to torch.mean(hidden_states, dim=0)
|
// equivalent to torch.mean(hidden_states, dim=0)
|
||||||
cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
|
cur = ggml_view_2d_slice(ctx0, cur, 0); // [n_embd, n_tokens]
|
||||||
for (int i = 0; i < n_altup - 1; ++i) {
|
for (int i = 0; i < n_altup - 1; ++i) {
|
||||||
cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
|
cur = ggml_add(ctx0, cur, ggml_view_2d_slice(ctx0, altup_unembd, i));
|
||||||
}
|
}
|
||||||
cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
|
cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
|
||||||
cb(cur, "unembd_merged", -1);
|
cb(cur, "unembd_merged", -1);
|
||||||
|
|
@ -235,39 +233,34 @@ ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
|
||||||
return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
|
return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
|
||||||
ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
|
|
||||||
GGML_ASSERT(idx < (int) x->ne[2]);
|
|
||||||
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
|
|
||||||
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
|
||||||
}
|
|
||||||
|
|
||||||
// equivalent to get_per_layer_inputs() in python code
|
// equivalent to get_per_layer_inputs() in python code
|
||||||
// output shape: [n_embd_altup, n_layer, n_tokens]
|
// output shape: [n_embd_altup, n_layer, n_tokens]
|
||||||
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
|
||||||
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||||
ggml_tensor * inp_per_layer;
|
ggml_tensor * inp_per_layer;
|
||||||
|
float tok_embd_scale = sqrtf((float) n_embd_altup);
|
||||||
if (ubatch.token) {
|
if (ubatch.token) {
|
||||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||||
ggml_set_input(inp->tokens);
|
ggml_set_input(inp->tokens);
|
||||||
res->t_inp_tokens = inp->tokens;
|
res->t_inp_tokens = inp->tokens;
|
||||||
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
|
inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
|
||||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
||||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
|
inp_per_layer = ggml_scale (ctx0, inp_per_layer, tok_embd_scale);
|
||||||
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||||
res->add_input(std::move(inp));
|
res->add_input(std::move(inp));
|
||||||
} else {
|
} else {
|
||||||
// Vision embedding path: use padding token (ID=0) embedding
|
// Multimodal embedding path: use padding token (ID=0) embedding
|
||||||
// TODO: verify if this is the correct behavior in transformers implementation
|
// TODO: verify if this is the correct behavior in transformers implementation
|
||||||
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
|
const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_altup * n_layer
|
||||||
|
|
||||||
// Extract and dequantize padding token embedding (row 0)
|
// Extract and dequantize padding token embedding (row 0)
|
||||||
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
|
||||||
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
|
inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
|
||||||
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);
|
||||||
|
|
||||||
// Reshape to [n_embd_altup, n_layer, 1]
|
// Reshape to [n_embd_altup, n_layer, 1]
|
||||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
||||||
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
cb(inp_per_layer, "inp_per_layer_multimodal", -1);
|
||||||
}
|
}
|
||||||
return inp_per_layer;
|
return inp_per_layer;
|
||||||
}
|
}
|
||||||
|
|
@ -275,18 +268,19 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
||||||
// equivalent to project_per_layer_inputs() in python code
|
// equivalent to project_per_layer_inputs() in python code
|
||||||
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
||||||
// output shape: [n_embd_altup, n_tokens, n_layer]
|
// output shape: [n_embd_altup, n_tokens, n_layer]
|
||||||
ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
|
ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
|
||||||
const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
|
const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
|
||||||
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
||||||
|
|
||||||
ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
|
ggml_tensor * per_layer_proj;
|
||||||
per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
|
per_layer_proj = ggml_mul_mat (ctx0, model.per_layer_model_proj, inp_batch);
|
||||||
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
|
per_layer_proj = ggml_scale (ctx0, per_layer_proj, per_layer_projection_scale);
|
||||||
per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, NULL, LLM_NORM_RMS,
|
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
|
||||||
-1); // [n_embd_altup, n_layer, n_tokens]
|
|
||||||
|
per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, NULL, LLM_NORM_RMS, -1);
|
||||||
cb(per_layer_proj, "per_layer_proj", -1);
|
cb(per_layer_proj, "per_layer_proj", -1);
|
||||||
|
|
||||||
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
inp_per_layer = ggml_add (ctx0, per_layer_proj, inp_per_layer);
|
||||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||||
cb(inp_per_layer, "inp_per_layer", -1);
|
cb(inp_per_layer, "inp_per_layer", -1);
|
||||||
|
|
||||||
|
|
@ -337,7 +331,7 @@ ggml_tensor * llm_build_gemma3n_iswa::altup_compute_router_modalities(ggml_tenso
|
||||||
// input cur shape: [n_embd, n_tokens, n_altup]
|
// input cur shape: [n_embd, n_tokens, n_altup]
|
||||||
// output shape: [n_embd, n_tokens, n_altup]
|
// output shape: [n_embd, n_tokens, n_altup]
|
||||||
ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
|
ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
|
||||||
ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
|
ggml_tensor * activated = ggml_view_2d_slice(ctx0, cur, i_altup_act); // [n_embd, n_tokens]
|
||||||
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
||||||
cb(modalities, "modalities", il);
|
cb(modalities, "modalities", il);
|
||||||
|
|
||||||
|
|
@ -365,7 +359,7 @@ ggml_tensor * llm_build_gemma3n_iswa::altup_correct(ggml_tensor * predictions, g
|
||||||
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
||||||
cb(modalities, "modalities", il);
|
cb(modalities, "modalities", il);
|
||||||
|
|
||||||
ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
|
ggml_tensor * active_prediction = ggml_view_2d_slice(ctx0, predictions, i_altup_act);
|
||||||
ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
|
ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
|
||||||
cb(innovation, "innovation", il);
|
cb(innovation, "innovation", il);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,322 @@
|
||||||
|
#include "models.h"
|
||||||
|
|
||||||
|
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
||||||
|
static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) {
|
||||||
|
GGML_ASSERT(idx < (int) x->ne[2]);
|
||||||
|
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
|
||||||
|
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
||||||
|
}
|
||||||
|
|
||||||
|
llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params) :
|
||||||
|
llm_graph_context(params),
|
||||||
|
model(model),
|
||||||
|
n_embd_per_layer(model.hparams.n_embd_per_layer) {
|
||||||
|
ggml_tensor * cur;
|
||||||
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
||||||
|
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
||||||
|
cb(inpL, "inp_scaled", -1);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// TODO: is causal == true correct? might need some changes
|
||||||
|
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||||
|
|
||||||
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
|
|
||||||
|
ggml_tensor * inp_per_layer = nullptr;
|
||||||
|
if (model.per_layer_tok_embd) {
|
||||||
|
inp_per_layer = build_inp_per_layer();
|
||||||
|
ggml_build_forward_expand(gf, inp_per_layer);
|
||||||
|
|
||||||
|
// inp_per_layer shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||||
|
inp_per_layer = project_per_layer_inputs(inpL, inp_per_layer);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_k(il);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v(il));
|
||||||
|
|
||||||
|
const int64_t n_head = hparams.n_head(il);
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||||
|
|
||||||
|
const float freq_base_l = model.get_rope_freq_base(cparams, il);
|
||||||
|
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||||
|
const int n_rot_l = hparams.n_rot(il);
|
||||||
|
|
||||||
|
// norm
|
||||||
|
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
|
ggml_tensor * freq_factors = nullptr;
|
||||||
|
if (!hparams.is_swa(il)) {
|
||||||
|
// full_attention layers use rope_freqs for proportional rope
|
||||||
|
freq_factors = model.layers[il].rope_freqs;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Q projection (shared for both non-KV and KV layers)
|
||||||
|
// this is to mirror Gemma4Attention in pytorch code
|
||||||
|
ggml_tensor * Qcur;
|
||||||
|
{
|
||||||
|
Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
|
||||||
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||||
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
||||||
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
cb(Qcur, "Qcur_pos", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
if (hparams.has_kv(il)) {
|
||||||
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
ggml_tensor * Vcur = model.layers[il].wv
|
||||||
|
? build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s)
|
||||||
|
: Kcur; // if v_proj is not present, use Kcur as Vcur
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
|
||||||
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||||
|
Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
|
||||||
|
|
||||||
|
cb(Kcur, "Kcur_normed", il);
|
||||||
|
cb(Vcur, "Vcur_normed", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
cb(Kcur, "Kcur_pos", il);
|
||||||
|
|
||||||
|
cur = build_attn(inp_attn, model.layers[il].wo,
|
||||||
|
nullptr, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
|
||||||
|
hparams.f_attention_scale, il);
|
||||||
|
} else {
|
||||||
|
// reuse KV cache of earlier layers
|
||||||
|
cur = build_attn(inp_attn,
|
||||||
|
model.layers[il].wo, nullptr, model.layers[il].wo_s,
|
||||||
|
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing
|
||||||
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||||
|
}
|
||||||
|
cur = build_norm(cur,
|
||||||
|
model.layers[il].attn_post_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "attn_post_norm", il);
|
||||||
|
|
||||||
|
ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL);
|
||||||
|
cb(attn_out, "attn_out", il);
|
||||||
|
|
||||||
|
// feed-forward network
|
||||||
|
const bool is_moe_layer = model.layers[il].ffn_gate_inp != nullptr;
|
||||||
|
if (is_moe_layer) {
|
||||||
|
// MLP (shared exp)
|
||||||
|
ggml_tensor * cur_mlp = build_norm(attn_out,
|
||||||
|
model.layers[il].ffn_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur_mlp, "ffn_norm_1", il);
|
||||||
|
|
||||||
|
cur_mlp = build_ffn(cur_mlp,
|
||||||
|
model.layers[il].ffn_up, nullptr, model.layers[il].ffn_up_s,
|
||||||
|
model.layers[il].ffn_gate, nullptr, model.layers[il].ffn_gate_s,
|
||||||
|
model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
|
||||||
|
nullptr,
|
||||||
|
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||||
|
cur_mlp = build_norm(cur_mlp,
|
||||||
|
model.layers[il].ffn_post_norm_1, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur_mlp, "ffn_mlp", il);
|
||||||
|
|
||||||
|
// Expert FFN
|
||||||
|
ggml_tensor * cur_moe = build_norm(attn_out,
|
||||||
|
model.layers[il].ffn_pre_norm_2, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur_moe, "ffn_norm_2", il);
|
||||||
|
|
||||||
|
// custom MoE logits calculation (router operates on attn_out, not cur)
|
||||||
|
ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
|
||||||
|
tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd));
|
||||||
|
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s);
|
||||||
|
ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens]
|
||||||
|
cb(logits, "ffn_moe_logits", il);
|
||||||
|
|
||||||
|
cur_moe = build_moe_ffn(cur_moe,
|
||||||
|
nullptr, // gate_inp
|
||||||
|
nullptr, // up_exps
|
||||||
|
nullptr, // gate_exps
|
||||||
|
model.layers[il].ffn_down_exps,
|
||||||
|
nullptr, // exp_probs_b (not used for gemma4)
|
||||||
|
n_expert, n_expert_used,
|
||||||
|
LLM_FFN_GELU, true,
|
||||||
|
1.0f,
|
||||||
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||||
|
il, logits,
|
||||||
|
model.layers[il].ffn_gate_up_exps,
|
||||||
|
nullptr, // up_exps_s
|
||||||
|
nullptr, // gate_exps_s
|
||||||
|
model.layers[il].ffn_down_exps_s);
|
||||||
|
cur_moe = build_norm(cur_moe,
|
||||||
|
model.layers[il].ffn_post_norm_2, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur_moe, "ffn_moe", il);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur_mlp, cur_moe);
|
||||||
|
cb(cur, "ffn_moe_combined", il);
|
||||||
|
} else {
|
||||||
|
cur = build_norm(attn_out,
|
||||||
|
model.layers[il].ffn_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
|
cur = build_ffn(cur,
|
||||||
|
model.layers[il].ffn_up, nullptr, model.layers[il].ffn_up_s,
|
||||||
|
model.layers[il].ffn_gate, nullptr, model.layers[il].ffn_gate_s,
|
||||||
|
model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
|
||||||
|
nullptr,
|
||||||
|
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
}
|
||||||
|
cur = build_norm(cur,
|
||||||
|
model.layers[il].ffn_post_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, -1);
|
||||||
|
cb(cur, "ffn_post_norm", il);
|
||||||
|
|
||||||
|
// residual connection
|
||||||
|
cur = ggml_add(ctx0, cur, attn_out);
|
||||||
|
|
||||||
|
// per-layer embedding
|
||||||
|
if (inp_per_layer) {
|
||||||
|
ggml_tensor * pe_in = cur;
|
||||||
|
cb(cur, "pe_in", il);
|
||||||
|
|
||||||
|
cur = build_lora_mm(model.layers[il].per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
|
||||||
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
|
||||||
|
ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens]
|
||||||
|
|
||||||
|
// TODO @ngxson : improve this
|
||||||
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_mul(ctx0, cur, inp_this_layer);
|
||||||
|
cur = build_lora_mm(model.layers[il].per_layer_proj, cur); // [n_embd, n_tokens]
|
||||||
|
cur = build_norm(cur, model.layers[il].per_layer_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "per_layer_embd_out", il);
|
||||||
|
|
||||||
|
// residual connection
|
||||||
|
cur = ggml_add(ctx0, pe_in, cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
// layer_scalar
|
||||||
|
if (model.layers[il].out_scale) {
|
||||||
|
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||||
|
cb(cur, "out_scaled", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = build_norm(cur,
|
||||||
|
model.output_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, -1);
|
||||||
|
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
res->t_embd = cur;
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
cur = build_lora_mm(model.output, cur);
|
||||||
|
|
||||||
|
if (hparams.f_final_logit_softcapping) {
|
||||||
|
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||||
|
cur = ggml_tanh(ctx0, cur);
|
||||||
|
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
||||||
|
}
|
||||||
|
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
res->t_logits = cur;
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
// equivalent to get_per_layer_inputs() in python code
|
||||||
|
// output shape: [n_embd_per_layer, n_layer, n_tokens]
|
||||||
|
ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
|
||||||
|
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||||
|
|
||||||
|
ggml_tensor * inp_per_layer;
|
||||||
|
float tok_embd_scale = sqrtf((float) n_embd_per_layer);
|
||||||
|
if (ubatch.token) {
|
||||||
|
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||||
|
ggml_set_input(inp->tokens);
|
||||||
|
res->t_inp_tokens = inp->tokens;
|
||||||
|
|
||||||
|
inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
|
||||||
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
|
||||||
|
inp_per_layer = ggml_scale (ctx0, inp_per_layer, tok_embd_scale);
|
||||||
|
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||||
|
|
||||||
|
res->add_input(std::move(inp));
|
||||||
|
} else {
|
||||||
|
// Multimodal embedding path: use padding token (ID=0) embedding
|
||||||
|
// TODO: verify if this is the correct behavior in transformers implementation
|
||||||
|
const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_per_layer * n_layer
|
||||||
|
|
||||||
|
// Extract and dequantize padding token embedding (row 0)
|
||||||
|
ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
|
||||||
|
inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
|
||||||
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);
|
||||||
|
|
||||||
|
// Reshape to [n_embd_per_layer, n_layer, 1]
|
||||||
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
|
||||||
|
cb(inp_per_layer, "inp_per_layer_multimodal", -1);
|
||||||
|
}
|
||||||
|
return inp_per_layer;
|
||||||
|
}
|
||||||
|
|
||||||
|
// equivalent to project_per_layer_inputs() in python code
|
||||||
|
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
||||||
|
// inp_batch shape: [n_embd, n_tokens]
|
||||||
|
// inp_per_layer shape: [n_embd_per_layer, n_layer, n_tokens] (from build_inp_per_layer)
|
||||||
|
// output shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||||
|
ggml_tensor * llm_build_gemma4_iswa::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
|
||||||
|
const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
|
||||||
|
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
||||||
|
|
||||||
|
// note: this matrix multiplication will be performed in the input layer (i.e. on the CPU)
|
||||||
|
ggml_tensor * per_layer_proj;
|
||||||
|
per_layer_proj = ggml_mul_mat (ctx0, model.per_layer_model_proj, inp_batch);
|
||||||
|
per_layer_proj = ggml_scale (ctx0, per_layer_proj, per_layer_projection_scale);
|
||||||
|
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_per_layer, n_layer, n_tokens);
|
||||||
|
|
||||||
|
per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, nullptr, LLM_NORM_RMS, -1);
|
||||||
|
cb(per_layer_proj, "per_layer_proj", -1);
|
||||||
|
|
||||||
|
inp_per_layer = ggml_add (ctx0, per_layer_proj, inp_per_layer);
|
||||||
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||||
|
cb(inp_per_layer, "inp_per_layer", -1);
|
||||||
|
|
||||||
|
// permute to shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||||
|
inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
|
||||||
|
return inp_per_layer;
|
||||||
|
}
|
||||||
|
|
@ -38,27 +38,8 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
if (model.layers[il].bq) {
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
}
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
}
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
}
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
// Apply Q/K norm if available (GLM-4.5 355B variant)
|
// Apply Q/K norm if available (GLM-4.5 355B variant)
|
||||||
if (model.layers[il].attn_q_norm) {
|
if (model.layers[il].attn_q_norm) {
|
||||||
|
|
@ -94,7 +75,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,7 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -41,40 +38,8 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = nullptr;
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur = nullptr;
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur = nullptr;
|
|
||||||
|
|
||||||
if (model.layers[il].wqkv == nullptr) {
|
|
||||||
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
}
|
|
||||||
Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
}
|
|
||||||
Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
} else {
|
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
||||||
cb(cur, "wqkv", il);
|
|
||||||
if (model.layers[il].bqkv) {
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
|
|
||||||
0 * sizeof(float) * (n_embd));
|
|
||||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
||||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_mrope) {
|
if (use_mrope) {
|
||||||
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -100,7 +65,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -34,22 +33,11 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
|
|
||||||
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -28,15 +26,8 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -55,7 +46,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -73,31 +73,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
|
||||||
const llama_model & model,
|
const llama_model & model,
|
||||||
const int64_t n_embd_head,
|
const int64_t n_embd_head,
|
||||||
const int il) {
|
const int il) {
|
||||||
// compute Q and K and (optionally) RoPE them
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, hparams.n_head(il), hparams.n_head_kv(il), il);
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
||||||
|
|
||||||
const bool use_rope = hparams.rope_finetuned;
|
const bool use_rope = hparams.rope_finetuned;
|
||||||
if (use_rope) {
|
if (use_rope) {
|
||||||
|
|
@ -116,7 +92,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
|
||||||
const float kq_scale =
|
const float kq_scale =
|
||||||
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
return cur;
|
return cur;
|
||||||
|
|
|
||||||
|
|
@ -76,31 +76,8 @@ ggml_tensor * llm_build_granite::build_attention_layer(
|
||||||
const int64_t n_embd_head,
|
const int64_t n_embd_head,
|
||||||
const int il) {
|
const int il) {
|
||||||
|
|
||||||
// compute Q and K and (optionally) RoPE them
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
n_embd_head, hparams.n_head(il), hparams.n_head_kv(il), il);
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
||||||
|
|
||||||
const bool use_rope = hparams.rope_finetuned;
|
const bool use_rope = hparams.rope_finetuned;
|
||||||
if (use_rope) {
|
if (use_rope) {
|
||||||
|
|
@ -124,7 +101,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
|
||||||
|
|
||||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
return cur;
|
return cur;
|
||||||
|
|
|
||||||
|
|
@ -30,27 +30,8 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -69,7 +50,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -30,18 +30,8 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
|
||||||
// self_attention
|
// self_attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -60,7 +50,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
GGML_ASSERT(n_embd_head == n_rot);
|
GGML_ASSERT(n_embd_head == n_rot);
|
||||||
|
|
||||||
|
const bool use_mrope = hparams.use_mrope();
|
||||||
|
|
||||||
|
int sections[4];
|
||||||
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||||
|
|
||||||
ggml_tensor * cur;
|
ggml_tensor * cur;
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
|
|
@ -34,44 +39,39 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
if (use_mrope) {
|
||||||
ctx0, Qcur, inp_pos, rope_factors,
|
Qcur = ggml_rope_multi(
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
);
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_multi(
|
||||||
|
ctx0, Kcur, inp_pos, rope_factors,
|
||||||
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
Qcur = ggml_rope_ext(
|
||||||
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(
|
||||||
|
ctx0, Kcur, inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
|
||||||
ctx0, Kcur, inp_pos, rope_factors,
|
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
|
||||||
);
|
|
||||||
|
|
||||||
Kcur = build_norm(Kcur,
|
Kcur = build_norm(Kcur,
|
||||||
model.layers[il].attn_k_norm, nullptr,
|
model.layers[il].attn_k_norm, nullptr,
|
||||||
LLM_NORM_RMS, il);
|
LLM_NORM_RMS, il);
|
||||||
|
|
@ -83,7 +83,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
||||||
cb(Qcur, "Qcur_norm", il);
|
cb(Qcur, "Qcur_norm", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -35,27 +35,8 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
|
||||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, rope_factors,
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
|
|
@ -84,7 +65,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
|
||||||
cb(Qcur, "Qcur_norm", il);
|
cb(Qcur, "Qcur_norm", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -30,27 +30,8 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
|
@ -69,7 +50,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||||
|
|
||||||
|
|
@ -24,22 +23,11 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(cur, "wqkv", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
||||||
cb(cur, "bqkv", il);
|
|
||||||
|
|
||||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
|
|
||||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
|
|
||||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
@ -66,8 +54,14 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
|
||||||
cb(inpL, "l_out", il);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
}
|
}
|
||||||
cur = build_norm(inpL,
|
cur = build_norm(inpL,
|
||||||
model.output_norm,
|
model.output_norm,
|
||||||
|
|
|
||||||
|
|
@ -31,25 +31,8 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
|
||||||
|
|
||||||
// Self-attention with separate Q, K, V projections
|
// Self-attention with separate Q, K, V projections
|
||||||
{
|
{
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur_bias", il);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur_bias", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur_bias", il);
|
|
||||||
|
|
||||||
// Reshape for attention
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
// Apply RoPE
|
// Apply RoPE
|
||||||
Qcur = ggml_rope_ext(
|
Qcur = ggml_rope_ext(
|
||||||
|
|
@ -68,7 +51,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
|
||||||
cb(Kcur, "Kcur_rope", il);
|
cb(Kcur, "Kcur_rope", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,25 +24,12 @@ llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_para
|
||||||
} else {
|
} else {
|
||||||
// Attention
|
// Attention
|
||||||
|
|
||||||
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
// No RoPE :)
|
// No RoPE :)
|
||||||
cur = build_attn(inp_hybrid->get_attn(),
|
cur = build_attn(inp_hybrid->get_attn(),
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -268,7 +268,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||||
ggml_tensor * Vcur = kv_cmpr;
|
ggml_tensor * Vcur = kv_cmpr;
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il);
|
cur = build_attn(inp_attn_k, layer.wo, NULL, layer.wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il);
|
||||||
cb(cur, "mla_out", il);
|
cb(cur, "mla_out", il);
|
||||||
} else { // MLA KV cache disabled. Fall back to MHA KV cache.
|
} else { // MLA KV cache disabled. Fall back to MHA KV cache.
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens);
|
||||||
|
|
@ -299,7 +299,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||||
|
|
||||||
// Direct softmax attention (with MHA KV cache)
|
// Direct softmax attention (with MHA KV cache)
|
||||||
// Use build_attn with inp_attn for proper mask handling
|
// Use build_attn with inp_attn for proper mask handling
|
||||||
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
|
cur = build_attn(inp_attn_kv, layer.wo, NULL, layer.wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
|
||||||
cb(cur, "mla_out", il);
|
cb(cur, "mla_out", il);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -362,6 +362,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||||
cur = build_cvec(cur, il);
|
cur = build_cvec(cur, il);
|
||||||
cb(cur, "l_out", il);
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
cur = inpL;
|
cur = inpL;
|
||||||
|
|
|
||||||
|
|
@ -42,16 +42,8 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
||||||
const auto n_embd_head = hparams.n_embd_head_v();
|
const auto n_embd_head = hparams.n_embd_head_v();
|
||||||
const auto n_head_kv = hparams.n_head_kv(il);
|
const auto n_head_kv = hparams.n_head_kv(il);
|
||||||
|
|
||||||
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
auto [q, k, v] = build_qkv(model.layers[il], cur,
|
||||||
cb(q, "model.layers.{}.self_attn.q_proj", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
auto * k = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(k, "model.layers.{}.self_attn.k_proj", il);
|
|
||||||
auto * v = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(v, "model.layers.{}.self_attn.v_proj", il);
|
|
||||||
|
|
||||||
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
|
|
||||||
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
// qk norm
|
// qk norm
|
||||||
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
|
|
@ -66,7 +58,7 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
||||||
attn_factor, beta_fast, beta_slow);
|
attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
|
|
||||||
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
||||||
|
|
@ -177,6 +169,9 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
||||||
cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
|
cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_out);
|
cur = ggml_add(ctx0, cur, ffn_out);
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||||
|
|
|
||||||
|
|
@ -30,18 +30,8 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr
|
||||||
// self_attention
|
// self_attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
cb(Qcur, "Qcur", il);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Qcur, "Qcur_normed", il);
|
cb(Qcur, "Qcur_normed", il);
|
||||||
|
|
@ -66,7 +56,7 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
|
|
@ -30,17 +30,8 @@ llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_para
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
// compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
n_embd_head, n_head, n_head_kv, il);
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -53,7 +44,7 @@ llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_para
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue