Compare commits
60 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
fc674574ca | |
|
|
166c20b473 | |
|
|
95ea8f9bfb | |
|
|
759f0084b4 | |
|
|
52699f6d19 | |
|
|
7f466e237b | |
|
|
b564a99ed6 | |
|
|
45a7083431 | |
|
|
1848f994e3 | |
|
|
07237ff99e | |
|
|
eb747f3def | |
|
|
a050c7d1bf | |
|
|
495b77aec2 | |
|
|
3987857d2d | |
|
|
f2a8e65ea7 | |
|
|
9e4e4c2401 | |
|
|
eef7422d4d | |
|
|
116a9f6ab7 | |
|
|
624be93425 | |
|
|
37c0a52c1b | |
|
|
c589dd77d4 | |
|
|
54f5c02f29 | |
|
|
a0e41ec261 | |
|
|
c976b22d7b | |
|
|
607c92430f | |
|
|
1d0f0285de | |
|
|
69f0d907ee | |
|
|
77b635e9c4 | |
|
|
5f3428219a | |
|
|
22710fdb82 | |
|
|
ca5d565dcd | |
|
|
49b505bcc5 | |
|
|
46dcb35aa3 | |
|
|
65d820a44a | |
|
|
e1cdce46c5 | |
|
|
15f6b6ad76 | |
|
|
081dc773a5 | |
|
|
551bb82960 | |
|
|
43c7c0f86c | |
|
|
fea629d00f | |
|
|
2a6de29364 | |
|
|
3d004fbf0a | |
|
|
12015a2174 | |
|
|
dfba84cb47 | |
|
|
d6a0f0d075 | |
|
|
14caedfa18 | |
|
|
61c7cd024d | |
|
|
e222814fc4 | |
|
|
16ca5e6fb1 | |
|
|
906aef3da8 | |
|
|
c890a9d9b4 | |
|
|
0ad6ceef59 | |
|
|
ab7d305b75 | |
|
|
49adc8b470 | |
|
|
6494251197 | |
|
|
9232af59ba | |
|
|
724ea71cf9 | |
|
|
dae7781052 | |
|
|
1335dfa785 | |
|
|
76684141a5 |
|
|
@ -202,6 +202,8 @@ whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
|
|||
|
||||
Note that transcription accuracy may occasionally be lower when running in parallel.
|
||||
|
||||
If `n_processors` is greater than 1, you cannot set any callbacks, including `new_segment_callback`, `progress_callback`, `encoder_begin_callback`, `abort_callback`, and the `log_callback` set by `Whisper.log_set`; if any of them is set, a `RuntimeError` is raised.
|
||||
|
||||
### Segments ###
|
||||
|
||||
Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:
|
||||
|
|
|
|||
|
|
@ -112,6 +112,10 @@ ruby_whisper_log_callback(enum ggml_log_level level, const char * buffer, void *
|
|||
return;
|
||||
}
|
||||
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||
if (NIL_P(log_callback)) {
|
||||
return;
|
||||
}
|
||||
|
||||
VALUE udata = rb_iv_get(mWhisper, "user_data");
|
||||
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
|
||||
}
|
||||
|
|
@ -129,10 +133,16 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
|
|||
rb_iv_set(self, "log_callback", log_callback);
|
||||
rb_iv_set(self, "user_data", user_data);
|
||||
|
||||
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
||||
rb_define_finalizer(log_callback, finalize_log_callback);
|
||||
if (!NIL_P(log_callback)) {
|
||||
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
||||
rb_define_finalizer(log_callback, finalize_log_callback);
|
||||
}
|
||||
|
||||
whisper_log_set(ruby_whisper_log_callback, NULL);
|
||||
if (NIL_P(log_callback)) {
|
||||
whisper_log_set(NULL, NULL);
|
||||
} else {
|
||||
whisper_log_set(ruby_whisper_log_callback, NULL);
|
||||
}
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#define RUBY_WHISPER_H
|
||||
|
||||
#include <ruby.h>
|
||||
#include <ruby/util.h>
|
||||
#include <ruby/memory_view.h>
|
||||
#include "whisper.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ extern const rb_data_type_t ruby_whisper_context_params_type;
|
|||
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
||||
extern VALUE rb_whisper_model_s_new(VALUE context);
|
||||
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
||||
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
|
||||
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors);
|
||||
|
||||
ID transcribe_option_names[1];
|
||||
|
||||
|
|
@ -436,7 +436,7 @@ full_body(VALUE rb_args)
|
|||
GetContext(*args->context, rw);
|
||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
|
||||
prepare_transcription(rwp, args->context);
|
||||
prepare_transcription(rwp, args->context, 1);
|
||||
int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples);
|
||||
|
||||
return INT2NUM(result);
|
||||
|
|
@ -487,7 +487,7 @@ full_parallel_body(VALUE rb_args)
|
|||
GetContext(*args->context, rw);
|
||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
|
||||
prepare_transcription(rwp, args->context);
|
||||
prepare_transcription(rwp, args->context, args->n_processors);
|
||||
int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors);
|
||||
|
||||
return INT2NUM(result);
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@
|
|||
|
||||
extern VALUE cParams;
|
||||
extern VALUE cVADParams;
|
||||
extern VALUE mWhisper;
|
||||
|
||||
extern ID id_call;
|
||||
|
||||
|
|
@ -186,6 +187,35 @@ static bool abort_callback(void * user_data) {
|
|||
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
check_thread_safety(ruby_whisper_params *rwp, VALUE *context, int n_processors)
|
||||
{
|
||||
if (n_processors == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
||||
rb_raise(rb_eRuntimeError, "new segment callback not supported on parallel transcription");
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->progress_callback_container->callback) || 0 != RARRAY_LEN(rwp->progress_callback_container->callbacks)) {
|
||||
rb_raise(rb_eRuntimeError, "progress callback not supported on parallel transcription");
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->encoder_begin_callback_container->callback) || 0 != RARRAY_LEN(rwp->encoder_begin_callback_container->callbacks)) {
|
||||
rb_raise(rb_eRuntimeError, "encoder begin callback not supported on parallel transcription");
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->abort_callback_container->callback) || 0 != RARRAY_LEN(rwp->abort_callback_container->callbacks)) {
|
||||
rb_raise(rb_eRuntimeError, "abort callback not supported on parallel transcription");
|
||||
}
|
||||
|
||||
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||
if (!NIL_P(log_callback)) {
|
||||
rb_raise(rb_eRuntimeError, "log callback not supported for parallel transcription");
|
||||
}
|
||||
}
|
||||
|
||||
static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
||||
rwp->new_segment_callback_container->context = context;
|
||||
|
|
@ -219,9 +249,13 @@ static void set_vad_params(ruby_whisper_params *rwp)
|
|||
rwp->params.vad_params = rwvp->params;
|
||||
}
|
||||
|
||||
/*
|
||||
TODO: Set abort callback to trap SIGINT and SIGTERM
|
||||
*/
|
||||
void
|
||||
prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
|
||||
prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors)
|
||||
{
|
||||
check_thread_safety(rwp, context, n_processors);
|
||||
register_callbacks(rwp, context);
|
||||
set_vad_params(rwp);
|
||||
}
|
||||
|
|
@ -240,6 +274,20 @@ rb_whisper_params_mark(void *p)
|
|||
void
|
||||
ruby_whisper_params_free(ruby_whisper_params *rwp)
|
||||
{
|
||||
if (rwp->params.language) {
|
||||
ruby_xfree((void *)rwp->params.language);
|
||||
}
|
||||
if (rwp->params.initial_prompt) {
|
||||
ruby_xfree((void *)rwp->params.initial_prompt);
|
||||
}
|
||||
if (rwp->params.vad_model_path) {
|
||||
ruby_xfree((void *)rwp->params.vad_model_path);
|
||||
}
|
||||
|
||||
xfree(rwp->new_segment_callback_container);
|
||||
xfree(rwp->progress_callback_container);
|
||||
xfree(rwp->encoder_begin_callback_container);
|
||||
xfree(rwp->abort_callback_container);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -248,7 +296,7 @@ rb_whisper_params_free(void *p)
|
|||
ruby_whisper_params *rwp = (ruby_whisper_params *)p;
|
||||
// How to free user_data and callback only when not referred to by others?
|
||||
ruby_whisper_params_free(rwp);
|
||||
free(rwp);
|
||||
xfree(rwp);
|
||||
}
|
||||
|
||||
static size_t
|
||||
|
|
@ -276,6 +324,15 @@ ruby_whisper_params_allocate(VALUE klass)
|
|||
ruby_whisper_params *rwp;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
if (rwp->params.language != NULL) {
|
||||
rwp->params.language = ruby_strdup(rwp->params.language);
|
||||
}
|
||||
if (rwp->params.initial_prompt != NULL) {
|
||||
rwp->params.initial_prompt = ruby_strdup(rwp->params.initial_prompt);
|
||||
}
|
||||
if (rwp->params.vad_model_path != NULL) {
|
||||
rwp->params.vad_model_path = ruby_strdup(rwp->params.vad_model_path);
|
||||
}
|
||||
rwp->diarize = false;
|
||||
rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
|
||||
rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
|
||||
|
|
@ -296,10 +353,12 @@ ruby_whisper_params_set_language(VALUE self, VALUE value)
|
|||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
ruby_xfree((void *)rwp->params.language);
|
||||
rwp->params.language = NULL;
|
||||
if (value == Qfalse || value == Qnil) {
|
||||
rwp->params.language = "auto";
|
||||
rwp->params.language = ruby_strdup("auto");
|
||||
} else {
|
||||
rwp->params.language = StringValueCStr(value);
|
||||
rwp->params.language = ruby_strdup(StringValueCStr(value));
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
|
@ -608,7 +667,13 @@ ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
|
|||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp->params.initial_prompt = StringValueCStr(value);
|
||||
ruby_xfree((void *)rwp->params.initial_prompt);
|
||||
rwp->params.initial_prompt = NULL;
|
||||
if (NIL_P(value)) {
|
||||
rwp->params.initial_prompt = NULL;
|
||||
} else {
|
||||
rwp->params.initial_prompt = ruby_strdup(StringValueCStr(value));
|
||||
}
|
||||
return value;
|
||||
}
|
||||
/*
|
||||
|
|
@ -1103,12 +1168,14 @@ ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
|
|||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
ruby_xfree((void *)rwp->params.vad_model_path);
|
||||
rwp->params.vad_model_path = NULL;
|
||||
if (NIL_P(value)) {
|
||||
rwp->params.vad_model_path = NULL;
|
||||
return value;
|
||||
}
|
||||
VALUE path = ruby_whisper_normalize_model_path(value);
|
||||
rwp->params.vad_model_path = StringValueCStr(path);
|
||||
rwp->params.vad_model_path = ruby_strdup(StringValueCStr(path));
|
||||
return value;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ extern ID id_to_path;
|
|||
extern ID transcribe_option_names[1];
|
||||
|
||||
extern void
|
||||
prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
|
||||
prepare_transcription(ruby_whisper_params * rwp, VALUE * self, int n_processors);
|
||||
|
||||
/*
|
||||
* transcribe a single file
|
||||
|
|
@ -73,7 +73,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|||
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||
// }
|
||||
|
||||
prepare_transcription(rwp, &self);
|
||||
prepare_transcription(rwp, &self, n_processors);
|
||||
|
||||
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ module Whisper
|
|||
def self.lang_id: (string name) -> Integer
|
||||
def self.lang_str: (Integer id) -> String
|
||||
def self.lang_str_full: (Integer id) -> String
|
||||
def self.log_set: (log_callback, Object? user_data) -> log_callback
|
||||
def self.log_set: (log_callback?, Object? user_data) -> log_callback
|
||||
def self.system_info_str: () -> String
|
||||
|
||||
class Context
|
||||
|
|
@ -52,6 +52,9 @@ module Whisper
|
|||
# puts text
|
||||
# end
|
||||
#
|
||||
# If n_processors is greater than 1, you cannot set any callbacks including
|
||||
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||
# and log_callback set by Whisper.log_set
|
||||
def transcribe: (path, Params, ?n_processors: Integer) -> self
|
||||
| (path, Params, ?n_processors: Integer) { (String) -> void } -> self
|
||||
|
||||
|
|
@ -129,6 +132,9 @@ module Whisper
|
|||
# It seems this approach can offer some speedup in some cases.
|
||||
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
#
|
||||
# If n_processors is greater than 1, you cannot set any callbacks including
|
||||
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||
# and log_callback set by Whisper.log_set
|
||||
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@ class TestParams < TestBase
|
|||
def test_language
|
||||
@params.language = "en"
|
||||
assert_equal @params.language, "en"
|
||||
GC.compact
|
||||
assert_equal @params.language, "en"
|
||||
@params.language = "auto"
|
||||
assert_equal @params.language, "auto"
|
||||
end
|
||||
|
|
|
|||
|
|
@ -43,9 +43,20 @@ class TestWhisper < TestBase
|
|||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
||||
assert_match(/what you can do for your country/i, text)
|
||||
}
|
||||
without_log_callback do
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
||||
assert_match(/what you can do for your country/i, text)
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def without_log_callback
|
||||
Whisper.log_set nil, nil
|
||||
yield
|
||||
ensure
|
||||
Whisper.log_set ->(level, buffer, user_data) {}, nil
|
||||
end
|
||||
|
||||
sub_test_case "After transcription" do
|
||||
|
|
@ -229,7 +240,9 @@ class TestWhisper < TestBase
|
|||
|
||||
def test_full_parallel
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
end
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
@ -240,7 +253,9 @@ class TestWhisper < TestBase
|
|||
def test_full_parallel_with_memory_view
|
||||
nprocessors = 2
|
||||
samples = JFKReader.new(AUDIO)
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
end
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
@ -259,7 +274,9 @@ class TestWhisper < TestBase
|
|||
|
||||
def test_full_parallel_without_length
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
end
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ require_relative "extsources"
|
|||
Gem::Specification.new do |s|
|
||||
s.name = "whispercpp"
|
||||
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
||||
s.version = '1.3.6'
|
||||
s.version = '1.3.7'
|
||||
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
||||
s.email = 'todd.fisher@gmail.com'
|
||||
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ void bench_main(size_t index) {
|
|||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
|
||||
fprintf(stderr, " https://github.com/ggml-org/whisper.cpp/issues/89\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Please include the following information:\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ static int whisper_bench_full(const whisper_params & params) {
|
|||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
|
||||
fprintf(stderr, " https://github.com/ggml-org/whisper.cpp/issues/89\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Please include the following information:\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ extern "C" {
|
|||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
|
|
@ -189,6 +190,7 @@ extern "C" {
|
|||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
|
||||
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
|
|
|
|||
|
|
@ -121,6 +121,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||
bli_thread_set_num_threads(ctx->n_threads);
|
||||
#elif defined(GGML_BLAS_USE_NVPL)
|
||||
nvpl_blas_set_num_threads(ctx->n_threads);
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
mkl_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
|
|
|
|||
|
|
@ -1544,8 +1544,8 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx,
|
|||
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
||||
step = 2;
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
|
||||
dtype);
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * ggml_type_size(dtype), m1, count, start, end + 1,
|
||||
step, dtype);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1788,9 +1788,11 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
ggml_tensor * src0 = dst->src[0]; // src
|
||||
ggml_tensor * src1 = dst->src[1]; // index
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
|
||||
|| dst->type == GGML_TYPE_BF16);
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
if (src0->type == dst->type) {
|
||||
|
|
@ -1881,6 +1883,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
break;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
||||
|
|
@ -1891,7 +1894,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
||||
dst->type);
|
||||
|
|
@ -1965,7 +1968,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor *
|
|||
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
if (weight_to_nz && is_matmul_weight(weight)) {
|
||||
if (weight_to_nz && weight->type != GGML_TYPE_BF16 && is_matmul_weight(weight)) {
|
||||
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
|
||||
} else {
|
||||
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
||||
|
|
@ -2146,6 +2149,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
#ifndef ASCEND_310P
|
||||
case GGML_TYPE_BF16:
|
||||
#endif
|
||||
ggml_cann_mat_mul_fp(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
|
@ -2943,6 +2949,27 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
// Rotate full tensor (no tail), using trans tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
|
||||
} else if (src0->data == dst->data && !ggml_is_contiguous(src0)) {
|
||||
// In-place on non-contiguous tensor: RotaryPositionEmbedding cannot safely
|
||||
// read and write the same non-contiguous buffer. Use contiguous temporaries.
|
||||
size_t contiguous_nb[GGML_MAX_DIMS];
|
||||
contiguous_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
contiguous_nb[i] = contiguous_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
int64_t total_elements = ggml_nelements(src0);
|
||||
ggml_cann_pool_alloc inplace_src_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
ggml_cann_pool_alloc inplace_dst_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
|
||||
acl_tensor_ptr acl_src_contig = ggml_cann_create_tensor(inplace_src_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
src0->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
acl_tensor_ptr acl_dst_contig = ggml_cann_create_tensor(inplace_dst_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
dst->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
|
||||
cann_copy(ctx, acl_src.get(), acl_src_contig.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_contig.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_contig.get());
|
||||
cann_copy(ctx, acl_dst_contig.get(), acl_dst.get());
|
||||
} else {
|
||||
// Rotate full tensor (no tail), using original tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
|
||||
|
|
@ -2984,6 +3011,58 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
int sections[4];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
||||
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
|
||||
const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
|
||||
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||
|
||||
if (is_imrope || mrope_used) {
|
||||
is_neox = true;
|
||||
}
|
||||
|
||||
int64_t rope_dims = n_dims;
|
||||
if (is_vision) {
|
||||
rope_dims = src0->ne[0];
|
||||
}
|
||||
|
||||
// Run the full cache init on the non-captured stream. This performs all
|
||||
// host-to-device memcpy, aclrtMalloc/Free, and on-device computations
|
||||
// so that the memory pool is warmed up and cache metadata is populated.
|
||||
aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
|
||||
mrope_used, is_imrope, is_vision, rope_dims);
|
||||
|
||||
// Reset `cached` so that during graph capture the on-device computations
|
||||
// (sin/cos, position multiply, repeat, etc.) still execute and get recorded
|
||||
// into the captured graph. The cache metadata (theta_scale_length,
|
||||
// theta_scale, sections, position_length, etc.) remains set, which causes
|
||||
// all host-to-device copy and malloc/free branches to be skipped.
|
||||
ctx.rope_cache.cached = false;
|
||||
}
|
||||
|
||||
void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
|
|
@ -3599,6 +3678,44 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
|||
acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
|
||||
acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
|
||||
|
||||
// Step 2.5: Pad Q, K, V along head dimension if D is not a multiple of 16
|
||||
// (required by FusedInferAttentionScoreV2)
|
||||
const int64_t D = src0->ne[0];
|
||||
const int64_t D_padded = GGML_PAD(D, 16);
|
||||
const bool needs_padding = (D != D_padded);
|
||||
|
||||
ggml_cann_pool_alloc q_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc k_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc v_pad_allocator(ctx.pool());
|
||||
|
||||
if (needs_padding) {
|
||||
int64_t paddings[] = { 0, D_padded - D, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
auto pad_fa_tensor = [&](acl_tensor_ptr & tensor, const int64_t * bsnd_ne,
|
||||
ggml_cann_pool_alloc & allocator) {
|
||||
int64_t pad_ne[GGML_MAX_DIMS] = { D_padded, bsnd_ne[1], bsnd_ne[2], bsnd_ne[3] };
|
||||
size_t pad_nb[GGML_MAX_DIMS];
|
||||
pad_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
pad_nb[i] = pad_nb[i - 1] * pad_ne[i - 1];
|
||||
}
|
||||
int64_t nelements = pad_ne[0] * pad_ne[1] * pad_ne[2] * pad_ne[3];
|
||||
void * buffer = allocator.alloc(nelements * faElemSize);
|
||||
acl_tensor_ptr padded =
|
||||
ggml_cann_create_tensor(buffer, faDataType, faElemSize, pad_ne, pad_nb, GGML_MAX_DIMS);
|
||||
aclnn_pad(ctx, tensor.get(), padded.get(), paddings);
|
||||
tensor = std::move(padded);
|
||||
};
|
||||
|
||||
pad_fa_tensor(acl_q_tensor, src0_bsnd_ne, q_pad_allocator);
|
||||
pad_fa_tensor(acl_k_tensor, src1_bsnd_ne, k_pad_allocator);
|
||||
pad_fa_tensor(acl_v_tensor, src2_bsnd_ne, v_pad_allocator);
|
||||
|
||||
src0_bsnd_ne[0] = D_padded;
|
||||
src1_bsnd_ne[0] = D_padded;
|
||||
src2_bsnd_ne[0] = D_padded;
|
||||
}
|
||||
|
||||
// Step 3: create the PSEShift tensor if needed
|
||||
// this tensor is treated as the mask (f16) in llama.cpp
|
||||
acl_tensor_ptr bcast_pse_tensor;
|
||||
|
|
@ -3688,17 +3805,16 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
|||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
acl_tensor_ptr fa_dst_tensor;
|
||||
acl_tensor_ptr acl_dst_tensor;
|
||||
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32 || needs_padding) {
|
||||
int64_t * out_f16_ne = src0_bsnd_ne;
|
||||
size_t out_f16_nb[GGML_MAX_DIMS];
|
||||
out_f16_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
||||
}
|
||||
int64_t out_nelements = out_f16_ne[0] * out_f16_ne[1] * out_f16_ne[2] * out_f16_ne[3];
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(out_nelements * faElemSize);
|
||||
|
||||
fa_dst_tensor =
|
||||
ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
|
||||
|
|
@ -3730,8 +3846,33 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
|||
nullptr // softmaxLse
|
||||
);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
// Step 6: post-processing, permute and cast to f32
|
||||
// Step 6: post-processing — slice padded output and/or cast to f32
|
||||
if (needs_padding) {
|
||||
ggml_cann_pool_alloc sliced_f16_allocator(ctx.pool());
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int64_t sliced_ne[GGML_MAX_DIMS] = { D, src0_bsnd_ne[1], src0_bsnd_ne[2], src0_bsnd_ne[3] };
|
||||
size_t sliced_nb[GGML_MAX_DIMS];
|
||||
sliced_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
sliced_nb[i] = sliced_nb[i - 1] * sliced_ne[i - 1];
|
||||
}
|
||||
int64_t sliced_nelements = sliced_ne[0] * sliced_ne[1] * sliced_ne[2] * sliced_ne[3];
|
||||
void * sliced_buffer = sliced_f16_allocator.alloc(sliced_nelements * faElemSize);
|
||||
acl_tensor_ptr sliced_f16_tensor = ggml_cann_create_tensor(sliced_buffer, faDataType, faElemSize,
|
||||
sliced_ne, sliced_nb, GGML_MAX_DIMS);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, sliced_f16_tensor.get());
|
||||
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, sliced_f16_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
} else {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, acl_dst_tensor.get());
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F32) {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -543,6 +543,21 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
|||
*/
|
||||
void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Pre-load the RoPE cache before ACL graph capture.
|
||||
*
|
||||
* This function must be called outside of graph capture to perform
|
||||
* host-to-device memory copies and device memory allocations that are
|
||||
* not allowed on a captured stream. After pre-loading, the rope cache
|
||||
* metadata is updated so that the subsequent call to
|
||||
* aclnn_rope_cache_init (inside graph capture) skips these operations
|
||||
* and only records the on-device computations into the captured graph.
|
||||
*
|
||||
* @param ctx CANN backend context.
|
||||
* @param dst A ROPE destination tensor from the computation graph.
|
||||
*/
|
||||
void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the index of the maximum value along the specified dimension
|
||||
* of a ggml tensor using the CANN backend.
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ struct ggml_graph_node_properties {
|
|||
}
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
|
||||
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){
|
||||
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -1234,7 +1234,8 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
||||
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||
if (weight_to_nz && tensor->type != GGML_TYPE_BF16
|
||||
&& is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
weight_format_to_nz(tensor, offset, ctx->device);
|
||||
|
|
@ -1443,7 +1444,8 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||
} else if (weight_to_nz && tensor->type != GGML_TYPE_BF16
|
||||
&& is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||
// NZ format weight are not support quantized yet.
|
||||
// If ND tensor transform to NZ, size may changed.
|
||||
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
|
||||
|
|
@ -2223,6 +2225,19 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
|
|||
// If no matching graph is found, add a new ACL graph.
|
||||
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
|
||||
cann_ctx->graph_lru_cache.push(new_graph);
|
||||
|
||||
// Pre-load rope cache before graph capture. During capture the
|
||||
// stream cannot perform host-to-device memcpy or device memory
|
||||
// malloc/free. Running the full cache init now populates the
|
||||
// cache metadata so these branches are skipped during capture,
|
||||
// while also warming up the memory pool.
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->op == GGML_OP_ROPE) {
|
||||
ggml_cann_rope_cache_preload(*cann_ctx, node);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
|
@ -2283,6 +2298,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
#ifndef ASCEND_310P
|
||||
case GGML_TYPE_BF16:
|
||||
#endif
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
|
|
@ -2320,6 +2338,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
#ifndef ASCEND_310P
|
||||
case GGML_TYPE_BF16:
|
||||
#endif
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
|
|
@ -2332,6 +2353,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
#ifndef ASCEND_310P
|
||||
case GGML_TYPE_BF16:
|
||||
#endif
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -2341,20 +2365,30 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||
case GGML_OP_CPY:
|
||||
{
|
||||
ggml_tensor * src = op->src[0];
|
||||
#ifdef ASCEND_310P
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
|
||||
// only support F32 and F16.
|
||||
// only support F32 and F16 on 310P.
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_BF16) ||
|
||||
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16 && src->type != GGML_TYPE_BF16)) {
|
||||
// only support F32, F16 and BF16.
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
{
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
#ifndef ASCEND_310P
|
||||
case GGML_TYPE_BF16:
|
||||
#endif
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -2503,10 +2537,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] % 16 != 0) {
|
||||
// TODO: padding to support
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
|
||||
if (logitSoftcap != 0.0f) {
|
||||
|
|
|
|||
|
|
@ -460,6 +460,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
endif()
|
||||
if(NOT GGML_CPU_ALL_VARIANTS)
|
||||
set(MARCH_STR "rv64gc")
|
||||
if (GGML_RVV)
|
||||
string(APPEND MARCH_STR "v")
|
||||
endif()
|
||||
|
||||
if (GGML_RV_ZFH)
|
||||
string(APPEND MARCH_STR "_zfh")
|
||||
endif()
|
||||
|
|
@ -467,7 +471,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
if (GGML_XTHEADVECTOR)
|
||||
string(APPEND MARCH_STR "_xtheadvector")
|
||||
elseif (GGML_RVV)
|
||||
string(APPEND MARCH_STR "_v")
|
||||
if (GGML_RV_ZVFH)
|
||||
string(APPEND MARCH_STR "_zvfh")
|
||||
endif()
|
||||
|
|
@ -475,12 +478,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
string(APPEND MARCH_STR "_zvfbfwma")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_RV_ZICBOP)
|
||||
string(APPEND MARCH_STR "_zicbop")
|
||||
endif()
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
|
||||
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
||||
else()
|
||||
# Begin with the lowest baseline
|
||||
|
|
@ -570,24 +575,36 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
set(KLEIDIAI_FETCH_ARGS
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
|
||||
)
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)
|
||||
endif()
|
||||
|
||||
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
|
||||
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP NEW
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.28")
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
EXCLUDE_FROM_ALL
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_MakeAvailable(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
else()
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED
|
||||
)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
|
||||
|
|
|
|||
|
|
@ -115,10 +115,10 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|||
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q8_K * y_blocks = (block_q8_K *)y;
|
||||
size_t nb = k / QK_K;
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
block_q8_K * y_blocks = (block_q8_K *)y;
|
||||
const size_t vlmax_f32m8 = __riscv_vsetvlmax_e32m8();
|
||||
|
||||
for (size_t i = 0; i < nb; i++) {
|
||||
|
|
@ -2052,6 +2052,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -2147,6 +2148,7 @@ static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2163,6 +2165,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -2269,6 +2272,7 @@ static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2285,6 +2289,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static const uint8_t sign_gather_indices_arr[64] = {
|
||||
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
|
||||
4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
|
||||
|
|
@ -2488,6 +2493,7 @@ static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
}
|
||||
*s = 0.125f * sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2507,7 +2513,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__riscv_v)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
static const int8_t keven_signs_q2xs[1024] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
||||
|
|
@ -2542,7 +2548,6 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|||
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
||||
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
};
|
||||
#endif
|
||||
|
||||
static void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
|
|
@ -2618,6 +2623,7 @@ static void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_
|
|||
}
|
||||
*s = 0.125f * sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2634,6 +2640,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -2818,6 +2825,7 @@ static void ggml_vec_dot_iq2_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size
|
|||
}
|
||||
*s = 0.125f * sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2830,10 +2838,11 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
break;
|
||||
}
|
||||
#else
|
||||
ggml_vec_dot_iq2_xxs_q8_K(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2928,6 +2937,7 @@ static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
}
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -2944,6 +2954,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -3036,6 +3047,7 @@ static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size
|
|||
}
|
||||
*s = 0.25f * sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3052,6 +3064,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3161,6 +3174,7 @@ static void ggml_vec_dot_iq4_nl_q8_0_vl256(int n, float * GGML_RESTRICT s, size_
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3177,6 +3191,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3190,7 +3205,6 @@ static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_
|
|||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
|
||||
float sumf = 0;
|
||||
int acc[4];
|
||||
|
|
@ -3252,14 +3266,8 @@ static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_
|
|||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3276,6 +3284,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3381,6 +3390,7 @@ static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3397,6 +3407,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -3467,6 +3478,7 @@ static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3483,6 +3495,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3592,6 +3605,7 @@ static void ggml_vec_dot_mxfp4_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
*s = sumf;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
|
|
@ -3604,6 +3618,6 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
break;
|
||||
}
|
||||
#else
|
||||
return ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -107,8 +107,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|||
}
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(y);
|
||||
ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
|
||||
ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -203,6 +202,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
#if defined __riscv_zvfh
|
||||
void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -222,7 +222,6 @@ void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
|
||||
|
|
@ -256,9 +255,6 @@ void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
|
||||
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemv_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -280,7 +276,6 @@ void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
||||
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
|
|
@ -392,9 +387,6 @@ void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
|
||||
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemv_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -416,7 +408,6 @@ void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16);
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
|
|
@ -451,9 +442,6 @@ void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
|
||||
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemv_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -476,7 +464,6 @@ void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(blocklen);
|
||||
UNUSED(bs);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
|
||||
|
|
@ -505,9 +492,6 @@ void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
|
||||
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemv_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -679,9 +663,9 @@ void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
|
||||
} // End K-Block
|
||||
__riscv_vse32_v_f32m2(s + col_tile, v_sumf, vl);
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
|
|
@ -909,6 +893,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
#if defined __riscv_zvfh
|
||||
void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -929,7 +914,6 @@ void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
|
|
@ -994,9 +978,6 @@ void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemm_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -1019,7 +1000,6 @@ void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
|
|
@ -1267,9 +1247,6 @@ void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemm_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -1292,7 +1269,6 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16);
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
|
|
@ -1355,9 +1331,6 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemm_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -1380,7 +1353,6 @@ void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
|
|
@ -1429,9 +1401,6 @@ void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
ggml_gemm_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -1731,3 +1700,4 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -531,7 +531,6 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
|
||||
UNUSED(bs);
|
||||
|
||||
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
|
||||
|
||||
// Permute mask used for easier vector processing at later stages
|
||||
|
|
@ -580,6 +579,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
if constexpr (
|
||||
std::is_same_v<block_tx8, block_q4_0x8> ||
|
||||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
|
||||
const __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
|
||||
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
|
||||
// Load 8 E8M0 exponents and convert to float via LUT
|
||||
|
|
|
|||
|
|
@ -2871,8 +2871,12 @@ struct ggml_cplan ggml_graph_plan(
|
|||
const int64_t ne11 = node->src[1]->ne[1]; // H
|
||||
const int64_t ne12 = node->src[1]->ne[2]; // Channels In
|
||||
|
||||
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
||||
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
||||
GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32);
|
||||
|
||||
cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03;
|
||||
cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12;
|
||||
|
||||
} break;
|
||||
case GGML_OP_TOP_K:
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1461,7 +1461,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
|||
return false;
|
||||
}
|
||||
if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) &&
|
||||
ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
|
||||
ggml_ne(op->src[1], 3) == 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1473,10 +1473,12 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
|||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
} else {
|
||||
if (op->src[0]->type != GGML_TYPE_F16) {
|
||||
return nullptr;
|
||||
}
|
||||
std::array<ggml_kleidiai_kernels *, GGML_KLEIDIAI_MAX_KERNEL_SLOTS> kernel_chain;
|
||||
const int slot_total = kleidiai_collect_kernel_chain(op, kernel_chain);
|
||||
const bool has_kernel = slot_total > 0;
|
||||
if (has_kernel && op->src[1]->ne[1] > 1) {
|
||||
if (slot_total > 0 && op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
|
|
|
|||
|
|
@ -3194,6 +3194,7 @@ class tinyBLAS_PPC {
|
|||
|
||||
private:
|
||||
|
||||
__attribute__((always_inline))
|
||||
inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
|
||||
vec_t vec_C[4];
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
|
|
@ -3204,6 +3205,7 @@ class tinyBLAS_PPC {
|
|||
}
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
|
||||
vec_t vec_C[4];
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
|
|
|
|||
|
|
@ -6923,16 +6923,15 @@ void ggml_compute_forward_conv_3d(
|
|||
ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_transpose_2d
|
||||
|
||||
void ggml_compute_forward_conv_transpose_2d(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
template <typename kernel_t>
|
||||
static void ggml_compute_forward_conv_transpose_2d_impl(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
|
|
@ -6943,7 +6942,7 @@ void ggml_compute_forward_conv_transpose_2d(
|
|||
|
||||
const int nk = ne00*ne01*ne02*ne03;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (ith == 0) {
|
||||
|
|
@ -6951,12 +6950,12 @@ void ggml_compute_forward_conv_transpose_2d(
|
|||
|
||||
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + 0;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
|
||||
ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
|
||||
const kernel_t * const src = (kernel_t *)((char *) src0->data + i03*nb03 + i02*nb02);
|
||||
kernel_t * dst_data = wdata + i02*ne01*ne00*ne03;
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
|
||||
|
|
@ -6968,13 +6967,17 @@ void ggml_compute_forward_conv_transpose_2d(
|
|||
|
||||
// permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + nk;
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
for (int i11 = 0; i11 < ne11; i11++) {
|
||||
const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
|
||||
ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
|
||||
kernel_t * dst_data = wdata + i11*ne10*ne12;
|
||||
for (int i10 = 0; i10 < ne10; i10++) {
|
||||
dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
|
||||
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
|
||||
dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
|
||||
} else {
|
||||
dst_data[i10*ne12 + i12] = src[i10];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -6996,21 +6999,27 @@ void ggml_compute_forward_conv_transpose_2d(
|
|||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
ggml_fp16_t * const wdata_src = wdata + nk;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + 0;
|
||||
kernel_t * const wdata_src = wdata + nk;
|
||||
|
||||
for (int i2 = ip0; i2 < ip1; i2++) { // Cout
|
||||
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
||||
ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
|
||||
kernel_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
|
||||
for (int i11 = 0; i11 < ne11; i11++) {
|
||||
for (int i10 = 0; i10 < ne10; i10++) {
|
||||
const int i1n = i11*ne10*ne12 + i10*ne12;
|
||||
for (int i01 = 0; i01 < ne01; i01++) {
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
float v = 0;
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
} else {
|
||||
ggml_vec_dot_f32(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
}
|
||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||
}
|
||||
}
|
||||
|
|
@ -7019,6 +7028,28 @@ void ggml_compute_forward_conv_transpose_2d(
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_conv_transpose_2d(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d_impl<ggml_fp16_t>(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d_impl<float>(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_dw
|
||||
|
||||
struct ggml_conv_2d_dw_params {
|
||||
|
|
|
|||
|
|
@ -1365,6 +1365,7 @@ void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
|||
}
|
||||
}
|
||||
|
||||
// Only enable these for RISC-V.
|
||||
#if defined __riscv_zvfh
|
||||
void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
|
|
@ -1568,6 +1569,7 @@ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
assert(nc % 16 == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
const int nb = n / QK_K;
|
||||
const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
|
||||
|
|
@ -2381,6 +2383,7 @@ void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|||
}
|
||||
}
|
||||
|
||||
// Only enable these for RISC-V.
|
||||
#if defined __riscv_zvfh
|
||||
void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
|
|
|
|||
|
|
@ -116,12 +116,11 @@ if (CUDAToolkit_FOUND)
|
|||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
list(APPEND GGML_SOURCES_CUDA
|
||||
template-instances/fattn-vec-instance-f16-f16.cu
|
||||
template-instances/fattn-vec-instance-q4_0-q4_0.cu
|
||||
template-instances/fattn-vec-instance-q8_0-q8_0.cu
|
||||
template-instances/fattn-vec-instance-bf16-bf16.cu)
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-cuda
|
||||
|
|
|
|||
|
|
@ -799,6 +799,22 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
|||
#endif // CUDART_VERSION >= 12050
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#ifdef FP8_AVAILABLE
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
// ROCm does not support fp8 in software on devices with fp8 hardware,
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf).
|
||||
const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
|
||||
#else
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
return static_cast<float>(xf) / 2;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP8_AVAILABLE
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
|
||||
const uint8_t sign_bit = (x < 0.0f) << 3;
|
||||
float ax = fabsf(x) * e;
|
||||
|
|
@ -931,6 +947,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
|
|||
static constexpr int qi = QI_MXFP4;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_NVFP4> {
|
||||
static constexpr int qk = QK_NVFP4;
|
||||
static constexpr int qr = QR_NVFP4;
|
||||
static constexpr int qi = QI_NVFP4;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
|
||||
static constexpr int qk = QK_K;
|
||||
|
|
|
|||
|
|
@ -1,12 +1,20 @@
|
|||
#include <algorithm>
|
||||
|
||||
#include "conv2d-transpose.cuh"
|
||||
#include "ggml.h"
|
||||
#include "convert.cuh"
|
||||
|
||||
__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
|
||||
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
|
||||
const int out_h, const int kernel_w, const int kernel_h, const int stride,
|
||||
const int c_in, const int c_out, const int batches) {
|
||||
template <typename kernel_t>
|
||||
static __global__ void conv2d_transpose_kernel(const float * __restrict__ input,
|
||||
const kernel_t * __restrict__ kernel,
|
||||
float * __restrict__ output,
|
||||
const int in_w,
|
||||
const int in_h,
|
||||
const int out_w,
|
||||
const int out_h,
|
||||
const int kernel_w,
|
||||
const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in,
|
||||
const int c_out,
|
||||
const int batches) {
|
||||
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
const int total_elements = out_w * out_h * c_out * batches;
|
||||
|
|
@ -26,24 +34,32 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
|
|||
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
|
||||
for (int kh = 0; kh < kernel_h; ++kh) {
|
||||
int in_y = out_y_idx - kh;
|
||||
if (in_y < 0 || in_y % stride) continue;
|
||||
if (in_y < 0 || in_y % stride) {
|
||||
continue;
|
||||
}
|
||||
in_y /= stride;
|
||||
if (in_y >= in_h) continue;
|
||||
if (in_y >= in_h) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int kw = 0; kw < kernel_w; ++kw) {
|
||||
int in_x = out_x_idx - kw;
|
||||
if (in_x < 0 || in_x % stride) continue;
|
||||
if (in_x < 0 || in_x % stride) {
|
||||
continue;
|
||||
}
|
||||
in_x /= stride;
|
||||
if (in_x >= in_w) continue;
|
||||
if (in_x >= in_w) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
|
||||
const int kernel_idx =
|
||||
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
|
||||
|
||||
float input_val = input[input_idx];
|
||||
half kern_val = kernel[kernel_idx];
|
||||
float input_val = input[input_idx];
|
||||
kernel_t kern_val = kernel[kernel_idx];
|
||||
|
||||
accumulator += input_val * (float) kern_val;
|
||||
accumulator += input_val * ggml_cuda_cast<float>(kern_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -56,11 +72,12 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
|
|||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float * input_data = (const float *) input->data;
|
||||
float * output_data = (float *) dst->data;
|
||||
const half * kernel_data = (const half *) kernel->data;
|
||||
const void * kernel_data = kernel->data;
|
||||
|
||||
const int input_w = input->ne[0];
|
||||
const int input_h = input->ne[1];
|
||||
|
|
@ -82,10 +99,17 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
|
|||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int total = (output_w * output_h * channels_out * batches);
|
||||
const int total = output_w * output_h * channels_out * batches;
|
||||
const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
||||
|
||||
conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
|
||||
channels_in, channels_out, batches);
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_transpose_kernel<half><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, (const half *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
|
||||
kernel_h, stride, channels_in, channels_out, batches);
|
||||
|
||||
} else {
|
||||
conv2d_transpose_kernel<float><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, (const float *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
|
||||
kernel_h, stride, channels_in, channels_out, batches);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
|
|
|||
|
|
@ -617,6 +617,45 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
|
|||
dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static __global__ void dequantize_block_nvfp4(
|
||||
const void * __restrict__ vx,
|
||||
dst_t * __restrict__ yy,
|
||||
const int64_t ne) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
const int64_t base = i * QK_NVFP4;
|
||||
if (base >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const block_nvfp4 * x = (const block_nvfp4 *) vx;
|
||||
const block_nvfp4 & xb = x[i];
|
||||
|
||||
const int sub = tid / (QK_NVFP4_SUB / 2);
|
||||
const int j = tid % (QK_NVFP4_SUB / 2);
|
||||
|
||||
const float d = ggml_cuda_ue4m3_to_fp32(xb.d[sub]);
|
||||
const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
|
||||
|
||||
const int64_t y0 = base + sub * QK_NVFP4_SUB + j;
|
||||
const int64_t y1 = y0 + QK_NVFP4_SUB / 2;
|
||||
|
||||
yy[y0] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]);
|
||||
yy[y1] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_nvfp4_cuda(
|
||||
const void * vx,
|
||||
dst_t * y,
|
||||
const int64_t k,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(k % QK_NVFP4 == 0);
|
||||
const int nb = k / QK_NVFP4;
|
||||
dequantize_block_nvfp4<<<nb, 32, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(
|
||||
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
|
||||
|
|
@ -715,6 +754,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
|
|
@ -766,6 +807,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
|
|
|
|||
|
|
@ -41,6 +41,16 @@ template<typename dst_t, typename src_t>
|
|||
return __bfloat162float(x);
|
||||
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
|
||||
return __float22half2_rn(x);
|
||||
} else if constexpr(std::is_same_v<src_t, nv_bfloat162> && std::is_same_v<dst_t, float2>) {
|
||||
#ifdef GGML_USE_HIP
|
||||
return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x)));
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __bfloat1622float2(x);
|
||||
#else
|
||||
return make_float2(__bfloat162float(x.x), __bfloat162float(x.y));
|
||||
#endif // __CUDA_ARCH__ >= 800
|
||||
#endif // GGML_USE_HIP
|
||||
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
|
||||
// bypass compile error on cuda 12.0.1
|
||||
#ifdef GGML_USE_HIP
|
||||
|
|
|
|||
|
|
@ -74,6 +74,37 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
|
|||
return sum;
|
||||
}
|
||||
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_bf16(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const nv_bfloat162 * K_bf16 = (const nv_bfloat162 *) K_c;
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
__align__(16) nv_bfloat162 tmp[cpy_ne];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_bf16 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
// FIXME replace macros in vector FA kernel with templating and use FP32 for BF16
|
||||
ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), __half22float2(((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]));
|
||||
#else
|
||||
ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
|
@ -321,6 +352,19 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict_
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_bf16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
static_assert(std::is_same_v<T, float>, "BF16 V dequantization only supports float output");
|
||||
static_assert(ne % 2 == 0, "bad ne");
|
||||
__align__(16) nv_bfloat162 tmp[ne/2];
|
||||
ggml_cuda_memcpy_1<ne*sizeof(nv_bfloat16)>(tmp, (const nv_bfloat16 *) vx + i0);
|
||||
float2 * dst_f2 = (float2 *) dst;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne/2; ++l) {
|
||||
dst_f2[l] = ggml_cuda_cast<float2>(tmp[l]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
|
@ -547,6 +591,8 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
|
|||
return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q8_0) {
|
||||
return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_BF16) {
|
||||
return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
|
||||
} else {
|
||||
static_assert(type_K == -1, "bad type");
|
||||
return nullptr;
|
||||
|
|
@ -567,6 +613,8 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
|
|||
return dequantize_V_q5_1<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q8_0) {
|
||||
return dequantize_V_q8_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_BF16) {
|
||||
return dequantize_V_bf16<float, ne>;
|
||||
} else {
|
||||
static_assert(type_V == -1, "bad type");
|
||||
return nullptr;
|
||||
|
|
|
|||
|
|
@ -75,17 +75,17 @@ static __global__ void flash_attn_ext_vec(
|
|||
#endif // GGML_USE_HIP
|
||||
|
||||
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
|
||||
constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
|
||||
constexpr int nthreads_V = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
|
||||
constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
|
||||
constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
|
||||
|
||||
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
|
||||
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
|
||||
|
||||
constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
|
||||
constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
|
||||
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
|
||||
|
||||
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
|
||||
#else
|
||||
|
|
@ -323,8 +323,18 @@ static __global__ void flash_attn_ext_vec(
|
|||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
half2 tmp[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
if constexpr (type_V == GGML_TYPE_BF16) {
|
||||
float2 tmp_f[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp_f,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]);
|
||||
}
|
||||
} else {
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
#pragma unroll
|
||||
|
|
@ -563,6 +573,7 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_ten
|
|||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_BF16); \
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
|
||||
|
|
@ -570,6 +581,7 @@ EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
|
|||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_BF16)
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
|
||||
|
|
@ -577,6 +589,7 @@ EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
|
|||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_BF16)
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
|
||||
|
|
@ -584,3 +597,4 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
|
|||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
|
||||
|
|
|
|||
|
|
@ -224,6 +224,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_F16)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
|
|
@ -231,6 +232,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_0)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
|
||||
|
|
@ -238,6 +240,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_1)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
|
||||
|
|
@ -245,6 +248,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_0)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
|
||||
|
|
@ -252,6 +256,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_1)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
|
||||
|
|
@ -259,10 +264,20 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
|
|||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q8_0)
|
||||
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
|
||||
#else
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
|
||||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
@ -355,6 +370,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
|||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_BF16:
|
||||
break;
|
||||
default:
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
#include "gated_delta_net.cuh"
|
||||
|
||||
template <int S_v, bool KDA>
|
||||
__global__ void gated_delta_net_cuda(const float * q,
|
||||
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
|
||||
gated_delta_net_cuda(const float * q,
|
||||
const float * k,
|
||||
const float * v,
|
||||
const float * g,
|
||||
|
|
@ -38,7 +39,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
|
||||
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
state += state_offset;
|
||||
curr_state += state_offset;
|
||||
curr_state += state_offset + col * S_v;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
|
||||
|
|
@ -46,10 +47,11 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
|
||||
float s_shard[rows_per_lane];
|
||||
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = curr_state[col * S_v + i];
|
||||
s_shard[r] = curr_state[i];
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
|
|
@ -63,6 +65,16 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
|
||||
const float beta_val = *beta_t;
|
||||
|
||||
// Cache k and q in registers
|
||||
float k_reg[rows_per_lane];
|
||||
float q_reg[rows_per_lane];
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
k_reg[r] = k_t[i];
|
||||
q_reg[r] = q_t[i];
|
||||
}
|
||||
|
||||
if constexpr (!KDA) {
|
||||
const float g_val = expf(*g_t);
|
||||
|
||||
|
|
@ -70,8 +82,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
float kv_shard = 0.0f;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
kv_shard += s_shard[r] * k_t[i];
|
||||
kv_shard += s_shard[r] * k_reg[r];
|
||||
}
|
||||
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
|
||||
|
|
@ -83,9 +94,8 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
float attn_partial = 0.0f;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
|
||||
attn_partial += s_shard[r] * q_t[i];
|
||||
s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col;
|
||||
attn_partial += s_shard[r] * q_reg[r];
|
||||
}
|
||||
|
||||
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
|
|
@ -99,7 +109,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
|
||||
kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
|
||||
}
|
||||
|
||||
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
|
|
@ -113,8 +123,8 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
|
||||
attn_partial += s_shard[r] * q_t[i];
|
||||
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
|
||||
attn_partial += s_shard[r] * q_reg[r];
|
||||
}
|
||||
|
||||
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
|
|
|
|||
|
|
@ -126,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|||
if (err == hipSuccess) {
|
||||
// hipMemAdviseSetCoarseGrain is an optional performance hint;
|
||||
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
|
||||
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||
(void)cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||
(void)hipGetLastError(); // clear any error
|
||||
}
|
||||
|
||||
|
|
@ -1297,7 +1297,12 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
|
||||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
|
||||
|
||||
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
const bool use_fp16 =
|
||||
src0->type != GGML_TYPE_NVFP4 &&
|
||||
(src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
|
|
@ -4781,6 +4786,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
#ifdef FP8_AVAILABLE
|
||||
case GGML_TYPE_NVFP4:
|
||||
#endif // FP8_AVAILABLE
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
|||
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
|
||||
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
|
||||
case GGML_TYPE_MXFP4: return vec_dot_mxfp4_q8_1;
|
||||
case GGML_TYPE_NVFP4: return vec_dot_nvfp4_q8_1;
|
||||
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
|
||||
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
|
||||
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
|
||||
|
|
@ -33,7 +34,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
|||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
|
||||
|
|
@ -41,6 +42,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
|||
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_MXFP4: return VDR_MXFP4_Q8_1_MMVQ;
|
||||
case GGML_TYPE_NVFP4: return VDR_NVFP4_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
|
||||
|
|
@ -173,11 +175,11 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
|
|||
return 1;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) {
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
return 1;
|
||||
return small_k ? nwarps : 1;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
|
|
@ -193,7 +195,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
|||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false, bool small_k = false>
|
||||
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
|
|
@ -208,7 +210,7 @@ static __global__ void mul_mat_vec_q(
|
|||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||
constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id, small_k, nwarps);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
|
@ -414,14 +416,16 @@ static __global__ void mul_mat_vec_q(
|
|||
template<ggml_type type>
|
||||
static std::pair<dim3, dim3> calc_launch_params(
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
||||
const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
||||
const int warp_size, const mmvq_parameter_table_id table_id, const bool small_k = false) {
|
||||
const int nwarps = calc_nwarps(type, ncols_dst, table_id);
|
||||
const int rpb = calc_rows_per_block(ncols_dst, table_id, small_k, nwarps);
|
||||
const int64_t nblocks = (nrows_x + rpb - 1) / rpb;
|
||||
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1);
|
||||
const dim3 block_dims(warp_size, nwarps, 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false>
|
||||
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false, bool small_k = false>
|
||||
static void mul_mat_vec_q_switch_fusion(
|
||||
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
|
|
@ -434,7 +438,7 @@ static void mul_mat_vec_q_switch_fusion(
|
|||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (c_ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
|
|
@ -444,7 +448,7 @@ static void mul_mat_vec_q_switch_fusion(
|
|||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
|
|
@ -488,11 +492,33 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
|||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
|
||||
// When K is small, increase rows_per_block to match nwarps so each warp has more work to do
|
||||
// Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_iter_1warp = vdr * warp_size / qi;
|
||||
const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
|
||||
const bool use_small_k = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
|
||||
if (use_small_k) {
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
|
||||
warp_size, table_id, true);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, false, true>(
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} else {
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
|
||||
warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
}
|
||||
} break;
|
||||
case 2: {
|
||||
constexpr int c_ncols_dst = 2;
|
||||
|
|
@ -602,6 +628,12 @@ static void mul_mat_vec_q_switch_type(
|
|||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_NVFP4>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_F16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q5_1);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q5_1);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q5_1);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q8_0);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_BF16);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_BF16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_BF16);
|
||||
|
|
@ -5,7 +5,7 @@ import os
|
|||
|
||||
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 576]
|
||||
|
||||
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0"]
|
||||
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]
|
||||
|
||||
SOURCE_FATTN_TILE = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
|
|
|
|||
|
|
@ -322,6 +322,38 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
|
|||
return d * sumi;
|
||||
}
|
||||
|
||||
#define VDR_NVFP4_Q8_1_MMVQ 4
|
||||
#define VDR_NVFP4_Q8_1_MMQ 8
|
||||
|
||||
// Partial dot product between one NVFP4 weight block and q8_1 activations.
//
//   vbq   - base pointer to an array of block_nvfp4; block `kbx` is used.
//   bq8_1 - q8_1 activation blocks; the block used in each step is
//           bq8_1 + (sub >> 1), where `sub` is the scale index of that step.
//   kbx   - index of the weight block inside vbq.
//   iqs   - starting 32-bit word index into the weight block's `qs` array.
//
// Each of the VDR_NVFP4_Q8_1_MMVQ/2 steps consumes two consecutive 32-bit
// words of packed 4-bit values, maps them through kvalues_mxfp4, accumulates
// four dp4a products against the activations and scales the integer result by
// (ue4m3-decoded weight scale) * (low half of the activation block's `ds`).
// NOTE(review): presumably get_int_from_table_16 returns the looked-up values
// split into an .x/.y pair in a layout that explains the +0/+2/+1/+3 pairing
// below -- confirm against that helper.
static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
    const void * __restrict__ vbq,
    const block_q8_1 * __restrict__ bq8_1,
    const int32_t & kbx,
    const int32_t & iqs) {

    const block_nvfp4 * blk = (const block_nvfp4 *) vbq + kbx;

    float acc = 0.0f;
#pragma unroll
    for (int step = 0; step < VDR_NVFP4_Q8_1_MMVQ/2; step++) {
        // Two adjacent 32-bit words of packed weights per step.
        const int32_t q_lo = iqs + 2*step;
        const int32_t q_hi = q_lo + 1;

        // One weight scale per pair of words.
        const int32_t sub = q_lo >> 1;

        // Two scale groups map onto one q8_1 block; odd groups start 4 words in.
        const block_q8_1 * act      = bq8_1 + (sub >> 1);
        const int32_t      act_base = (sub & 1) << 2;

        const int2 w_lo = get_int_from_table_16(get_int_b4(blk->qs, q_lo), kvalues_mxfp4);
        const int2 w_hi = get_int_from_table_16(get_int_b4(blk->qs, q_hi), kvalues_mxfp4);

        int dot = 0;
        dot = ggml_cuda_dp4a(w_lo.x, get_int_b4(act->qs, act_base + 0), dot);
        dot = ggml_cuda_dp4a(w_lo.y, get_int_b4(act->qs, act_base + 2), dot);
        dot = ggml_cuda_dp4a(w_hi.x, get_int_b4(act->qs, act_base + 1), dot);
        dot = ggml_cuda_dp4a(w_hi.y, get_int_b4(act->qs, act_base + 3), dot);

        const float scale = ggml_cuda_ue4m3_to_fp32(blk->d[sub]) * __low2float(act->ds);
        acc += scale * float(dot);
    }

    return acc;
}
|
||||
#define VDR_Q2_K_Q8_1_MMVQ 1
|
||||
#define VDR_Q2_K_Q8_1_MMQ 4
|
||||
|
||||
|
|
|
|||
|
|
@ -6,9 +6,10 @@
|
|||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#if CUDART_VERSION >= 12050
|
||||
#if CUDART_VERSION >= 11080
|
||||
#include <cuda_fp8.h>
|
||||
#endif // CUDART_VERSION >= 12050
|
||||
#define FP8_AVAILABLE
|
||||
#endif // CUDART_VERSION >= 11080
|
||||
|
||||
#if CUDART_VERSION >= 12080
|
||||
#include <cuda_fp4.h>
|
||||
|
|
|
|||
|
|
@ -235,6 +235,12 @@
|
|||
typedef __hip_bfloat16 nv_bfloat16;
|
||||
typedef __hip_bfloat162 nv_bfloat162;
|
||||
|
||||
#if HIP_VERSION >= 60200000
|
||||
#include <hip/hip_fp8.h>
|
||||
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
|
||||
#define FP8_AVAILABLE
|
||||
#endif // HIP_VERSION >= 60200000
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ static int opt_verbose = 0;
|
|||
static int opt_profile = 0;
|
||||
static int opt_hostbuf = 1; // hostbuf ON by default
|
||||
static int opt_experimental = 0;
|
||||
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
|
||||
|
||||
// Enable all stages by default
|
||||
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
|
||||
|
|
@ -460,7 +461,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|||
d[7] = x[i * 8 + 7].d;
|
||||
}
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_q4x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -479,7 +480,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|||
const uint8_t * y_q = y + 0; // quants first
|
||||
const uint8_t * y_d = y + qrow_size; // then scales
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_q4x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -795,7 +796,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|||
d[7] = x[i * 8 + 7].d;
|
||||
}
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_q8x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -813,7 +814,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|||
const uint8_t * y_q = y + 0; // quants first
|
||||
const uint8_t * y_d = y + qrow_size; // then scales
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_q8x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -1148,7 +1149,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|||
e[7] = x[i * 8 + 7].e;
|
||||
}
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_mxfp4x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -1167,7 +1168,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|||
const uint8_t * y_q = y + 0; // quants first
|
||||
const uint8_t * y_e = y + qrow_size; // then scales
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
if (opt_verbose > 2) {
|
||||
for (int i = 0; i < nb; i++) {
|
||||
dump_packed_block_mxfp4x4x2(y, i, k);
|
||||
}
|
||||
|
|
@ -1405,6 +1406,13 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
repack_q8_0_q8x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
// IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
|
||||
repack_q4_0_q4x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_MXFP4:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
|
|
@ -1441,6 +1449,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
repack_q8x4x2_q8_0(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
repack_q4x4x2_q4_0(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_MXFP4:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
|
|
@ -1693,7 +1707,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|||
// Start the DSP-side service. We need to pass the queue ID to the
|
||||
// DSP in a FastRPC call; the DSP side will import the queue and start
|
||||
// listening for packets in a callback.
|
||||
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
|
||||
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
|
||||
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
|
||||
|
|
@ -1818,6 +1832,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
if (src0->ne[0] % 32) {
|
||||
return false;
|
||||
|
|
@ -1867,6 +1882,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
|||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
if ((src0->ne[0] % 32)) {
|
||||
return false;
|
||||
|
|
@ -2362,6 +2378,27 @@ static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs,
|
|||
return n_bufs;
|
||||
}
|
||||
|
||||
// Fill `req`/`bufs` for a GGML_OP_CONT node.
// CONT is dispatched as a plain copy, so the request reuses HTP_OP_CPY with
// t->src[0] as the input buffer and t itself as the output buffer.
// Returns the total number of dspqueue buffers initialised in `bufs`.
static inline size_t init_cont_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
    req->op = HTP_OP_CPY;

    // Source first, destination right after whatever slots the source used.
    const size_t n_src = htp_req_buff_init(&req->src0, &bufs[0], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
    const size_t n_dst = htp_req_buff_init(&req->dst, &bufs[n_src], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

    return n_src + n_dst;
}
|
||||
|
||||
// Fill `req`/`bufs` for a GGML_OP_REPEAT node.
// The request op is HTP_OP_REPEAT; t->src[0] is registered as the input
// buffer and t itself as the output buffer.
// Returns the total number of dspqueue buffers initialised in `bufs`.
static inline size_t init_repeat_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
    req->op = HTP_OP_REPEAT;

    // Source first, destination right after whatever slots the source used.
    const size_t n_src = htp_req_buff_init(&req->src0, &bufs[0], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
    const size_t n_dst = htp_req_buff_init(&req->dst, &bufs[n_src], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

    return n_src + n_dst;
}
|
||||
|
||||
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
req->op = HTP_OP_GET_ROWS;
|
||||
|
||||
|
|
@ -2449,12 +2486,33 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
|
|||
break;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
req->op = HTP_OP_UNARY_SILU;
|
||||
supported = true;
|
||||
} else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
req->op = HTP_OP_UNARY_GELU;
|
||||
supported = true;
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
req->op = HTP_OP_UNARY_SIGMOID;
|
||||
supported = true;
|
||||
break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
req->op = HTP_OP_UNARY_NEG;
|
||||
supported = true;
|
||||
break;
|
||||
case GGML_UNARY_OP_EXP:
|
||||
req->op = HTP_OP_UNARY_EXP;
|
||||
supported = true;
|
||||
break;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
req->op = HTP_OP_UNARY_SOFTPLUS;
|
||||
supported = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
|
|
@ -2553,8 +2611,26 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|||
delete backend;
|
||||
}
|
||||
|
||||
// Map weight type to its activation quantization family.
|
||||
// Types in the same family produce identical Q8 formats in VTCM and can
|
||||
// safely share quantized activation data via SKIP_QUANTIZE.
|
||||
// When adding a new quantized type, assign it the correct family here.
|
||||
static inline int act_quant_family(enum ggml_type wtype) {
|
||||
switch (wtype) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
return 1; // Q8x4x2
|
||||
default:
|
||||
return 0; // unknown / not quantized
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
|
||||
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
|
||||
return (op0 && op0->src[1] == op1->src[1] &&
|
||||
act_quant_family(op0->src[0]->type) == act_quant_family(op1->src[0]->type) &&
|
||||
act_quant_family(op0->src[0]->type) != 0);
|
||||
}
|
||||
|
||||
static inline bool is_compute_op(ggml_tensor *node)
|
||||
|
|
@ -2640,16 +2716,28 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|||
ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
|
||||
(ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
|
||||
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
||||
switch (ggml_get_unary_op(node)) {
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
|
||||
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
|
||||
(ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
|
||||
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
||||
switch (ggml_get_glu_op(node)) {
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
|
|
@ -2676,6 +2764,14 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|||
ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_CONT:
|
||||
ggml_hexagon_dispatch_op<init_cont_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_REPEAT:
|
||||
ggml_hexagon_dispatch_op<init_repeat_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
|
||||
break;
|
||||
|
|
@ -3006,6 +3102,39 @@ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
GGML_UNUSED(sess);
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
|
||||
// CONT is same-type only, supports f32 and f16
|
||||
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
GGML_UNUSED(sess);
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
// Support f32 and f16
|
||||
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
||||
|
||||
// src and dst must be the same type
|
||||
if (src0->type != dst->type) return false;
|
||||
|
||||
// dst dims must be multiples of src dims
|
||||
if (dst->ne[0] % src0->ne[0] != 0) return false;
|
||||
if (dst->ne[1] % src0->ne[1] != 0) return false;
|
||||
if (dst->ne[2] % src0->ne[2] != 0) return false;
|
||||
if (dst->ne[3] % src0->ne[3] != 0) return false;
|
||||
|
||||
// require contiguous tensors (no transposition)
|
||||
if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
||||
|
||||
|
|
@ -3063,21 +3192,32 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|||
break;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
{
|
||||
const auto unary_op = ggml_get_unary_op(op);
|
||||
if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
}
|
||||
break;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
{
|
||||
const auto glu_op = ggml_get_glu_op(op);
|
||||
if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
|
||||
switch (ggml_get_glu_op(op)) {
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
}
|
||||
break;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
supp = ggml_hexagon_supported_rope(sess, op);
|
||||
break;
|
||||
|
|
@ -3098,6 +3238,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|||
supp = ggml_hexagon_supported_cpy(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CONT:
|
||||
supp = ggml_hexagon_supported_cont(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_REPEAT:
|
||||
supp = ggml_hexagon_supported_repeat(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_ARGSORT:
|
||||
supp = ggml_hexagon_supported_argsort(sess, op);
|
||||
break;
|
||||
|
|
@ -3249,6 +3397,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
|||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
|
||||
const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
|
||||
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
|
||||
|
|
@ -3258,6 +3408,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
|||
const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
|
||||
const char * str_etm = getenv("GGML_HEXAGON_ETM");
|
||||
const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
|
||||
const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX");
|
||||
const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
|
||||
const char * str_arch = getenv("GGML_HEXAGON_ARCH");
|
||||
|
||||
|
|
@ -3267,8 +3418,9 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
|||
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
|
||||
opt_opsync = str_opsync ? atoi(str_opsync) : 0;
|
||||
opt_profile = str_profile ? atoi(str_profile) : 0;
|
||||
opt_etm = str_etm ? atoi(str_etm) : 0;
|
||||
opt_etm = str_etm ? atoi(str_etm) : 0;
|
||||
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
||||
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
|
||||
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
|
||||
|
||||
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ add_library(${HTP_LIB} SHARED
|
|||
set-rows-ops.c
|
||||
get-rows-ops.c
|
||||
cpy-ops.c
|
||||
repeat-ops.c
|
||||
argsort-ops.c
|
||||
ssm-conv.c
|
||||
)
|
||||
|
|
@ -39,6 +40,24 @@ target_compile_definitions(${HTP_LIB} PRIVATE
|
|||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
|
||||
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
|
||||
# HMX acceleration: available on v73+ architectures
set(HTP_HMX_VERSIONS v73 v75 v79 v81)

# DSP_VERSION is quoted so that an empty or unset value is passed as one
# (empty) argument and simply yields index -1, instead of making list(FIND)
# fail at configure time with a wrong-argument-count error.
list(FIND HTP_HMX_VERSIONS "${DSP_VERSION}" _hmx_idx)

if (_hmx_idx GREATER_EQUAL 0)
    # Build the HMX matmul kernels only for the DSP versions listed above
    target_sources(${HTP_LIB} PRIVATE
        hmx-matmul-ops.c
    )

    # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
    set_source_files_properties(
        hmx-matmul-ops.c
        PROPERTIES COMPILE_OPTIONS "-mhmx"
    )

    # Lets the rest of the HTP sources compile their HMX paths conditionally
    target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
|
||||
|
|
|
|||
|
|
@ -24,28 +24,26 @@
|
|||
// Context for binary operations
|
||||
struct htp_binary_context {
|
||||
struct htp_ops_context * octx;
|
||||
struct fastdiv_values dim1_div;
|
||||
struct fastdiv_values dim2_div;
|
||||
struct fastdiv_values dim12_div;
|
||||
|
||||
struct fastdiv_values src0_dim1_div; // ne01
|
||||
struct fastdiv_values src0_dim2_div; // ne02
|
||||
struct fastdiv_values src0_dim12_div;// ne03
|
||||
|
||||
struct fastdiv_values src1_dim1_div; // ne11
|
||||
struct fastdiv_values src1_dim2_div; // ne12
|
||||
struct fastdiv_values src1_dim3_div; // ne13
|
||||
|
||||
uint32_t nrows_per_thread;
|
||||
bool split_at_ne01;
|
||||
bool split_at_ne02;
|
||||
|
||||
// Precomputed values
|
||||
uint32_t block_max;
|
||||
uint32_t nrows_per_thread;
|
||||
size_t src0_row_size_aligned;
|
||||
size_t src1_row_size_aligned;
|
||||
size_t dst_row_size_aligned;
|
||||
uint32_t src1_fetch_rows; // 1 or block_max
|
||||
uint32_t src1_dma_stride; // 0 or stride
|
||||
|
||||
bool split_at_ne01;
|
||||
bool split_at_ne02;
|
||||
};
|
||||
|
||||
#define htp_binary_preamble \
|
||||
#define htp_binary_preamble \
|
||||
const struct htp_tensor * src0 = &octx->src0; \
|
||||
const struct htp_tensor * src1 = &octx->src1; \
|
||||
struct htp_tensor * dst = &octx->dst; \
|
||||
|
|
@ -72,12 +70,11 @@ struct htp_binary_context {
|
|||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row,
|
||||
uint32_t ne01, uint32_t ne02) {
|
||||
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row, uint32_t ne01, uint32_t ne02) {
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
|
||||
uint32_t rows_left = end_row - ir;
|
||||
|
|
@ -191,6 +188,8 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
if (start_row >= end_row) return;
|
||||
|
||||
FARF(HIGH, "binary-scalar: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
||||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
|
|
@ -204,9 +203,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
|
|
@ -215,7 +214,7 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
|
|
@ -229,9 +228,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
|
||||
// src1 indices (broadcast/repeat)
|
||||
|
|
@ -255,9 +254,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
||||
|
||||
|
|
@ -282,6 +281,8 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
if (start_row >= end_row) return;
|
||||
|
||||
FARF(HIGH, "binary-same-shape: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
||||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
|
|
@ -297,9 +298,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
|
||||
uint32_t i13 = (ne13 == 1) ? 0 : i03;
|
||||
|
|
@ -307,23 +308,23 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
uint32_t i11 = (ne11 == 1) ? 0 : i01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
uint8_t * src1_base = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
|
||||
uint8_t * src1_curr = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
|
||||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * s1_spad = src1_spad_base + spad_idx * src1_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
||||
dma_queue_push(q, dma_make_ptr(s1_spad, src1_base), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, current_block_size);
|
||||
dma_queue_push(q, dma_make_ptr(s1_spad, src1_curr), bctx->src1_row_size_aligned, nb11, row_size_bytes, current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
}
|
||||
|
||||
for (uint32_t ir = start_row; ir < end_row; ) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
|
||||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
uint8_t * s1_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
|
|
@ -335,9 +336,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
}
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
|
||||
|
|
@ -345,9 +346,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
|
||||
uint32_t p13 = (ne13 == 1) ? 0 : p03;
|
||||
|
|
@ -358,7 +359,7 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|||
uint8_t * s1_next = (uint8_t *)src1->data + p13 * nb13 + p12 * nb12 + p11 * nb11;
|
||||
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
||||
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, next_block_size);
|
||||
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, nb11, row_size_bytes, next_block_size);
|
||||
|
||||
ir_prefetch += next_block_size;
|
||||
}
|
||||
|
|
@ -373,15 +374,17 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|||
struct htp_ops_context * octx = bctx->octx;
|
||||
htp_binary_preamble;
|
||||
|
||||
const uint32_t src0_type = octx->src0.type;
|
||||
const uint32_t src0_type = octx->src0.type;
|
||||
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
||||
const uint32_t total_rows = ne01 * ne02 * ne03;
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
if (start_row >= end_row) return;
|
||||
|
||||
FARF(HIGH, "binary-row-bcast: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
||||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * src1_spad = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
||||
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
|
|
@ -391,15 +394,14 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|||
uint32_t ir_prefetch = start_row;
|
||||
int spad_idx = 0;
|
||||
|
||||
void * s1_ptr = (void *) src1_spad;
|
||||
void * s1_ptr = (void *) src1_spad_base;
|
||||
|
||||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
|
|
@ -407,7 +409,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
|
|
@ -415,7 +417,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|||
|
||||
for (uint32_t ir = start_row; ir < end_row; ) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
|
||||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
for (uint32_t r = 0; r < current_block_size; r++) {
|
||||
|
|
@ -425,21 +427,19 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|||
COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
|
||||
}
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
|
||||
|
||||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
uint32_t p01 = prem - p02 * ne01;
|
||||
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
||||
ir_prefetch += next_block_size;
|
||||
|
|
@ -458,14 +458,16 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|||
const uint32_t src0_type = octx->src0.type;
|
||||
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
||||
const uint32_t total_rows = ne01 * ne02 * ne03;
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
if (start_row >= end_row) return;
|
||||
|
||||
FARF(HIGH, "binary-complex: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
||||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
uint32_t ir_prefetch = start_row;
|
||||
|
|
@ -473,11 +475,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
|
|
@ -485,7 +486,7 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
|
|
@ -496,11 +497,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
for (uint32_t r = 0; r < current_block_size; r++) {
|
||||
uint32_t r_i01 = i01 + r;
|
||||
|
|
@ -521,11 +521,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
uint32_t p01 = prem - p02 * ne01;
|
||||
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
||||
ir_prefetch += next_block_size;
|
||||
|
|
@ -545,14 +544,16 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|||
const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
|
||||
const uint32_t row_size_bytes = ne00 * elem_size_bytes;;
|
||||
const uint32_t total_rows = ne01 * ne02 * ne03;
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
||||
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
||||
if (start_row >= end_row) return;
|
||||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
|
||||
FARF(HIGH, "binary-repeat: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
uint32_t ir_prefetch = start_row;
|
||||
|
|
@ -560,11 +561,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
|
|
@ -572,7 +572,7 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
|
|
@ -583,11 +583,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
for (uint32_t r = 0; r < current_block_size; r++) {
|
||||
uint32_t r_i01 = i01 + r;
|
||||
|
|
@ -612,11 +611,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
uint32_t p01 = prem - p02 * ne01;
|
||||
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
||||
ir_prefetch += next_block_size;
|
||||
|
|
@ -646,6 +644,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
const uint32_t nb02 = src0->nb[2];
|
||||
const uint32_t nb03 = src0->nb[3];
|
||||
const uint32_t nb11 = src1->nb[1]; // src1 row stride
|
||||
|
||||
const uint32_t nb1 = dst->nb[1];
|
||||
const uint32_t nb2 = dst->nb[2];
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
|
@ -657,8 +656,8 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
|
||||
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
||||
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
uint32_t ir_prefetch = start_row;
|
||||
|
|
@ -666,11 +665,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
|
||||
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
||||
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
||||
|
|
@ -678,7 +676,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
||||
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
|
||||
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, ne00 * sizeof(float), 0);
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), current_block_size);
|
||||
ir_prefetch += current_block_size;
|
||||
spad_idx ^= 1;
|
||||
|
|
@ -689,11 +687,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
||||
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
||||
|
||||
uint32_t i03, i02, i01, rem;
|
||||
i03 = fastdiv(ir, &bctx->dim12_div);
|
||||
rem = ir - i03 * (ne02 * ne01);
|
||||
i02 = fastdiv(rem, &bctx->dim1_div);
|
||||
i01 = rem - i02 * ne01;
|
||||
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
||||
uint32_t rem = ir - i03 * (ne02 * ne01);
|
||||
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
||||
uint32_t i01 = rem - i02 * ne01;
|
||||
|
||||
for (uint32_t r = 0; r < current_block_size; r++) {
|
||||
uint32_t r_i01 = i01 + r; // linear within block since we split at ne01
|
||||
|
|
@ -712,11 +709,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|||
|
||||
if (ir_prefetch < end_row) {
|
||||
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
||||
uint32_t p03, p02, p01, prem;
|
||||
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
|
||||
prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
p02 = fastdiv(prem, &bctx->dim1_div);
|
||||
p01 = prem - p02 * ne01;
|
||||
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
||||
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
||||
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
||||
uint32_t p01 = prem - p02 * ne01;
|
||||
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
||||
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), next_block_size);
|
||||
ir_prefetch += next_block_size;
|
||||
|
|
@ -739,40 +735,36 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|||
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
|
||||
const size_t src0_row_size = src0->ne[0] * elem_size;
|
||||
const size_t src1_row_size = src1->ne[0] * elem_size;
|
||||
const size_t dst_row_size = dst->ne[0] * elem_size;
|
||||
const size_t dst_row_size = dst->ne[0] * elem_size;
|
||||
|
||||
// Align to VLEN
|
||||
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
||||
size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
|
||||
size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
bool is_add_id = (octx->op == HTP_OP_ADD_ID);
|
||||
bool is_scalar = !is_add_id && (src1->ne[0] == 1);
|
||||
|
||||
// Determine which kernel we will use to alloc memory and dispatch
|
||||
bool use_vector_same = !is_add_id && !is_scalar && ((src0->nb[1] % VLEN) == 0) && (src1->ne[0] == src0->ne[0]) &&
|
||||
bool is_transposed = (src0->nb[1] < src0_row_size || src1->nb[1] < src1_row_size || dst->nb[1] < dst_row_size);
|
||||
|
||||
bool is_same_shape = !is_add_id && !is_scalar && !is_transposed &&
|
||||
(src1->ne[0] == src0->ne[0] && src0->ne[0] % VLEN == 0) &&
|
||||
(src1->ne[1] == src0->ne[1] || src1->ne[1] == 1) &&
|
||||
(src1->ne[2] == src0->ne[2] || src1->ne[2] == 1) &&
|
||||
(src1->ne[3] == src0->ne[3] || src1->ne[3] == 1);
|
||||
|
||||
bool is_row_bcast = use_vector_same && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
|
||||
bool use_complex = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] == src0->ne[0]);
|
||||
bool use_repeat = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] != src0->ne[0]);
|
||||
bool is_row_bcast = is_same_shape && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
|
||||
bool is_complex = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] == src0->ne[0]);
|
||||
bool is_repeat = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] != src0->ne[0]);
|
||||
|
||||
size_t spad_row_total;
|
||||
if (is_scalar) {
|
||||
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
} else if (is_row_bcast) {
|
||||
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
} else if (use_vector_same) {
|
||||
if (is_same_shape) {
|
||||
spad_row_total = 2 * (src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned);
|
||||
} else if (is_add_id) {
|
||||
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned); // src1 read directly
|
||||
} else {
|
||||
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
}
|
||||
|
||||
size_t rows_per_buffer = octx->ctx->vtcm_size / (n_threads * spad_row_total);
|
||||
|
||||
// Adjust for static src1 in row_bcast case
|
||||
if (is_row_bcast) {
|
||||
size_t needed_static = src1_row_size_aligned;
|
||||
|
|
@ -782,28 +774,26 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|||
}
|
||||
|
||||
if (rows_per_buffer < 1) {
|
||||
FARF(ERROR, "binary: VTCM too small\n");
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
FARF(ERROR, "binary: VTCM too small\n");
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.size_per_thread = rows_per_buffer * 2 * src0_row_size_aligned;
|
||||
octx->dst_spad.size_per_thread = rows_per_buffer * 2 * dst_row_size_aligned;
|
||||
|
||||
if (is_scalar || use_complex || use_repeat || is_add_id) {
|
||||
octx->src1_spad.size_per_thread = 0;
|
||||
} else if (is_row_bcast) {
|
||||
if (is_add_id || is_scalar || is_complex || is_repeat || is_row_bcast) {
|
||||
octx->src1_spad.size_per_thread = 0;
|
||||
} else {
|
||||
octx->src1_spad.size_per_thread = rows_per_buffer * 2 * src1_row_size_aligned;
|
||||
}
|
||||
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
if (is_row_bcast) {
|
||||
octx->src1_spad.size = src1_row_size_aligned;
|
||||
} else {
|
||||
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
|
||||
}
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
|
||||
if (octx->ctx->vtcm_size < (octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size)) {
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
|
|
@ -823,46 +813,37 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|||
}
|
||||
|
||||
struct htp_binary_context bctx;
|
||||
bctx.octx = octx;
|
||||
bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
||||
bctx.block_max = rows_per_buffer;
|
||||
bctx.octx = octx;
|
||||
bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
||||
bctx.block_max = rows_per_buffer;
|
||||
bctx.src0_row_size_aligned = src0_row_size_aligned;
|
||||
bctx.src1_row_size_aligned = src1_row_size_aligned;
|
||||
bctx.dst_row_size_aligned = dst_row_size_aligned;
|
||||
|
||||
bctx.dim1_div = init_fastdiv_values(src0->ne[1]);
|
||||
bctx.dim2_div = init_fastdiv_values(src0->ne[2]);
|
||||
bctx.dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
|
||||
bctx.src0_dim1_div = init_fastdiv_values(src0->ne[1]);
|
||||
bctx.src0_dim2_div = init_fastdiv_values(src0->ne[2]);
|
||||
bctx.src0_dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
|
||||
|
||||
bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
|
||||
bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
|
||||
bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
|
||||
bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
|
||||
bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
|
||||
bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
|
||||
|
||||
bool src0_contig_dim1 = (src0->nb[2] == src0->ne[1] * src0->nb[1]);
|
||||
bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
|
||||
bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
|
||||
|
||||
bool src0_contig_dim2 = (src0->nb[3] == src0->ne[2] * src0->nb[2]);
|
||||
bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
|
||||
bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
|
||||
|
||||
bctx.split_at_ne01 = (src0->ne[2] > 1) &&
|
||||
((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
|
||||
|
||||
bctx.split_at_ne02 = (src0->ne[3] > 1) &&
|
||||
((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
|
||||
|
||||
// Precompute specific kernel parameters
|
||||
if (use_vector_same) {
|
||||
bctx.src1_dma_stride = (src1->ne[1] == 1) ? 0 : src1->nb[1];
|
||||
bctx.src1_fetch_rows = (src1->ne[1] == 1) ? 1 : rows_per_buffer;
|
||||
}
|
||||
bctx.split_at_ne01 = (src0->ne[2] > 1) && ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
|
||||
bctx.split_at_ne02 = (src0->ne[3] > 1) && ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
|
||||
|
||||
worker_callback_t worker_func;
|
||||
if (is_add_id) worker_func = binary_job_add_id;
|
||||
else if (is_scalar) worker_func = binary_job_scalar;
|
||||
else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
|
||||
else if (use_vector_same) worker_func = binary_job_vector_same_shape;
|
||||
else if (use_complex) worker_func = binary_job_vector_complex;
|
||||
else worker_func = binary_job_element_repeat;
|
||||
if (is_add_id) worker_func = binary_job_add_id;
|
||||
else if (is_scalar) worker_func = binary_job_scalar;
|
||||
else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
|
||||
else if (is_same_shape) worker_func = binary_job_vector_same_shape;
|
||||
else if (is_complex) worker_func = binary_job_vector_complex;
|
||||
else worker_func = binary_job_element_repeat;
|
||||
|
||||
if (is_row_bcast) {
|
||||
dma_queue_pop(q);
|
||||
|
|
|
|||
|
|
@ -31,8 +31,8 @@ dma_queue * dma_queue_create(size_t capacity) {
|
|||
q->capacity = capacity;
|
||||
q->idx_mask = capacity - 1;
|
||||
|
||||
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
||||
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
||||
q->desc = (dma_descriptor_2d *) memalign(64, capacity * sizeof(dma_descriptor_2d));
|
||||
memset(q->desc, 0, capacity * sizeof(dma_descriptor_2d));
|
||||
|
||||
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
|
||||
memset(q->dptr, 0, capacity * sizeof(dma_ptr));
|
||||
|
|
|
|||
|
|
@ -10,19 +10,84 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Define the HW descriptor structs here since the ones in HexSDK are a bit out of date
|
||||
// Descriptor layout used for 1D (linear) DMA transfers.
// dma_queue_push_single_1d() overlays this struct on a 2D queue slot, sets
// desc_size to 0 ("1D mode") and stores the transfer length in `size`.
// The control bits (desc_size .. done) are declared in the same order and with
// the same widths as in dma_descriptor_2d.
typedef struct dma_descriptor_1d_s {
|
||||
void * next; // set to NULL when the descriptor is filled in
|
||||
uint32_t size:24; // transfer length; 24-bit field
|
||||
uint32_t desc_size:2; // 0 selects 1D mode
|
||||
uint32_t dst_comp:1;
|
||||
uint32_t src_comp:1;
|
||||
uint32_t dst_bypass:1; // destination L2 bypass flag
|
||||
uint32_t src_bypass:1; // source L2 bypass flag
|
||||
uint32_t order:1;
|
||||
uint32_t done:1; // cleared on push; presumably set by the DMA engine on completion — confirm
|
||||
void * src;
|
||||
void * dst;
|
||||
} dma_descriptor_1d;
|
||||
|
||||
#if __HVX_ARCH__ < 75
|
||||
|
||||
// 2D DMA descriptor layout selected when __HVX_ARCH__ < 75.
// row_size, nrows, both strides and both offsets are 16-bit fields, so each is
// limited to 65535. dma_queue_push_single_2d() fills this layout with
// desc_size = 1 ("2d mode") and desc_type = 0 ("2d (16-bit) mode").
typedef struct dma_descriptor_2d_s {
|
||||
void * next; // set to NULL when the descriptor is filled in
|
||||
uint32_t reserved0:24;
|
||||
uint32_t desc_size:2; // 1 selects 2D mode
|
||||
uint32_t dst_comp:1;
|
||||
uint32_t src_comp:1;
|
||||
uint32_t dst_bypass:1; // destination L2 bypass flag
|
||||
uint32_t src_bypass:1; // source L2 bypass flag
|
||||
uint32_t order:1;
|
||||
uint32_t done:1; // polled by dma_queue_pop() to detect completion
|
||||
void * src;
|
||||
void * dst;
|
||||
uint32_t desc_type:8; // 0 for this 16-bit layout
|
||||
uint32_t reserved1:24;
|
||||
uint32_t row_size:16; // per-row transfer size
|
||||
uint32_t nrows:16; // number of rows
|
||||
uint32_t src_stride:16;
|
||||
uint32_t dst_stride:16;
|
||||
uint32_t src_offset:16; // written as 0 by dma_queue_push_single_2d()
|
||||
uint32_t dst_offset:16; // written as 0 by dma_queue_push_single_2d()
|
||||
} dma_descriptor_2d;
|
||||
|
||||
#else
|
||||
|
||||
// 2D DMA descriptor layout selected when __HVX_ARCH__ >= 75.
// row_size and both strides are 24-bit fields; the row count is split across
// nrows_lo (low 8 bits) and nrows_hi (next 8 bits). dma_queue_push_single_2d()
// fills this layout with desc_size = 1 and desc_type = 9 ("2d (24-bit) mode").
typedef struct dma_descriptor_2d_s {
|
||||
void * next; // set to NULL when the descriptor is filled in
|
||||
uint32_t dst_stride:24;
|
||||
uint32_t desc_size:2; // 1 selects 2D mode
|
||||
uint32_t dst_comp:1;
|
||||
uint32_t src_comp:1;
|
||||
uint32_t dst_bypass:1; // destination L2 bypass flag
|
||||
uint32_t src_bypass:1; // source L2 bypass flag
|
||||
uint32_t order:1;
|
||||
uint32_t done:1; // polled by dma_queue_pop() to detect completion
|
||||
void * src;
|
||||
void * dst;
|
||||
uint32_t desc_type:8; // 9 for this 24-bit layout
|
||||
uint32_t reserved0:24;
|
||||
uint32_t row_size:24; // per-row transfer size
|
||||
uint32_t nrows_lo:8; // nrows & 0xff
|
||||
uint32_t nrows_hi:8; // nrows >> 8 (truncated to 8 bits by the field width)
|
||||
uint32_t src_stride:24;
|
||||
uint32_t offset:24; // written as 0 by dma_queue_push_single_2d()
|
||||
uint32_t reserved1:8;
|
||||
} dma_descriptor_2d;
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
void *dst;
|
||||
void *dst;
|
||||
const void *src;
|
||||
} dma_ptr;
|
||||
|
||||
typedef struct {
|
||||
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
|
||||
hexagon_udma_descriptor_type1_t * tail; // tail pointer
|
||||
dma_ptr * dptr; // dst/src pointers
|
||||
uint32_t push_idx;
|
||||
uint32_t pop_idx;
|
||||
uint32_t capacity;
|
||||
uint32_t idx_mask;
|
||||
dma_descriptor_2d * desc; // descriptor pointers
|
||||
dma_descriptor_2d * tail; // tail pointer
|
||||
dma_ptr * dptr; // dst/src pointers
|
||||
uint32_t push_idx;
|
||||
uint32_t pop_idx;
|
||||
uint32_t capacity;
|
||||
uint32_t idx_mask;
|
||||
} dma_queue;
|
||||
|
||||
dma_queue * dma_queue_create(size_t capacity);
|
||||
|
|
@ -59,71 +124,87 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
|
|||
return p;
|
||||
}
|
||||
|
||||
static inline bool dma_queue_push(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t width, // width in bytes. number of bytes to transfer per row
|
||||
size_t nrows) {
|
||||
#if __HVX_ARCH__ < 73
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 0;
|
||||
#else
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 1;
|
||||
#endif
|
||||
|
||||
static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
|
||||
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
||||
FARF(ERROR, "dma-push: queue full\n");
|
||||
FARF(HIGH, "dma-push: queue full\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];
|
||||
dma_descriptor_1d * desc = (dma_descriptor_1d *) &q->desc[q->push_idx];
|
||||
desc->next = NULL;
|
||||
desc->desc_size = 0; // 1D mode
|
||||
desc->src_bypass = dma_src_l2_bypass_on;
|
||||
desc->dst_bypass = dma_dst_l2_bypass_on;
|
||||
desc->order = 1;
|
||||
desc->done = 0;
|
||||
desc->src = (void *) dptr.src;
|
||||
desc->dst = (void *) dptr.dst;
|
||||
desc->size = size;
|
||||
|
||||
q->dptr[q->push_idx] = dptr;
|
||||
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = (dma_descriptor_2d *) desc;
|
||||
|
||||
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
|
||||
q->push_idx = (q->push_idx + 1) & q->idx_mask;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
|
||||
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
||||
FARF(HIGH, "dma-push: queue full\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
dma_descriptor_2d * desc = &q->desc[q->push_idx];
|
||||
|
||||
desc->next = NULL;
|
||||
desc->length = 0;
|
||||
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
|
||||
desc->dstbypass = 1;
|
||||
desc->srcbypass = 1;
|
||||
#if __HVX_ARCH__ >= 73
|
||||
desc->dstbypass = 1;
|
||||
desc->srcbypass = 1;
|
||||
#else
|
||||
desc->dstbypass = 0;
|
||||
desc->srcbypass = 1;
|
||||
#endif
|
||||
desc->order = 0;
|
||||
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
|
||||
desc->reserved0 = 0;
|
||||
desc->reserved1 = 0;
|
||||
desc->desc_size = 1; // 2d mode
|
||||
desc->src_bypass = dma_src_l2_bypass_on;
|
||||
desc->dst_bypass = dma_dst_l2_bypass_on;
|
||||
desc->src_comp = 0;
|
||||
desc->dst_comp = 0;
|
||||
desc->order = 1;
|
||||
desc->done = 0;
|
||||
desc->src_stride = src_stride;
|
||||
desc->dst_stride = dst_stride;
|
||||
desc->src = (void *) dptr.src;
|
||||
desc->dst = (void *) dptr.dst;
|
||||
desc->allocation = 0;
|
||||
desc->padding = 0;
|
||||
desc->roiwidth = width;
|
||||
desc->roiheight = nrows;
|
||||
desc->srcstride = src_row_size;
|
||||
desc->dststride = dst_row_size;
|
||||
desc->srcwidthoffset = 0;
|
||||
desc->dstwidthoffset = 0;
|
||||
desc->row_size = row_size;
|
||||
|
||||
#if __HVX_ARCH__ < 75
|
||||
desc->desc_type = 0; // 2d (16-bit) mode
|
||||
desc->nrows = nrows;
|
||||
desc->src_offset = 0;
|
||||
desc->dst_offset = 0;
|
||||
#else
|
||||
desc->desc_type = 9; // 2d (24-bit) mode
|
||||
desc->nrows_lo = (nrows & 0xff);
|
||||
desc->nrows_hi = (nrows >> 8);
|
||||
desc->offset = 0;
|
||||
#endif
|
||||
|
||||
q->dptr[q->push_idx] = dptr;
|
||||
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = desc;
|
||||
|
||||
// FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src);
|
||||
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
|
||||
q->push_idx = (q->push_idx + 1) & q->idx_mask;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t nrows) {
|
||||
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
|
||||
}
|
||||
|
||||
|
||||
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t nrows) {
|
||||
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
|
||||
}
|
||||
|
||||
static inline dma_ptr dma_queue_pop(dma_queue * q) {
|
||||
dma_ptr dptr = { NULL };
|
||||
|
||||
|
|
@ -131,12 +212,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
|
|||
return dptr;
|
||||
}
|
||||
|
||||
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
|
||||
dma_descriptor_2d * desc = &q->desc[q->pop_idx];
|
||||
|
||||
// Wait for desc to complete
|
||||
while (1) {
|
||||
dmpoll();
|
||||
if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) {
|
||||
if (desc->done) {
|
||||
break;
|
||||
}
|
||||
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
|
||||
|
|
@ -175,6 +256,62 @@ static inline uint32_t dma_queue_capacity(dma_queue * q) {
|
|||
return q->capacity;
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ < 75
|
||||
|
||||
// Overflow-safe DMA push: all 2d descriptor fields (row_size, nrows, src_stride, dst_stride) are 16-bit, max 65535.
|
||||
// This version transparently handles values that exceed the 16-bit limit and submits chained DMA transactions.
|
||||
|
||||
#define DMA_MAX_FIELD_VAL 65535u
|
||||
|
||||
// Queue a row-wise copy described by (dst_stride, src_stride, row_size, nrows),
// working around the 16-bit limit of the 2D descriptor fields.
//
// Strategy, in order:
//   1. nrows == 0, or every parameter <= DMA_MAX_FIELD_VAL: one 2D descriptor.
//   2. a single row, or rows packed back-to-back on both sides
//      (row_size == src_stride == dst_stride): one 1D descriptor of
//      row_size * nrows bytes.
//   3. otherwise: one 1D descriptor per row, calling dma_queue_pop() between
//      consecutive rows.
//
// Returns false as soon as an underlying push reports failure, true otherwise.
//
// NOTE(review): in case 2 the product row_size * nrows is not checked against
// the 24-bit 1D size field — confirm callers never exceed it.
// NOTE(review): in case 3 dma_queue_pop() retires the entry at the queue's pop
// index, and only the last row remains queued on return — confirm callers do
// not have other transfers in flight on `q` when this path is taken.
static inline bool dma_queue_push(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
    const bool fits_in_16_bits = (row_size   <= DMA_MAX_FIELD_VAL) &&
                                 (nrows      <= DMA_MAX_FIELD_VAL) &&
                                 (src_stride <= DMA_MAX_FIELD_VAL) &&
                                 (dst_stride <= DMA_MAX_FIELD_VAL);

    // Common case: a single 2D descriptor can express the whole transfer.
    if (nrows == 0 || __builtin_expect(fits_in_16_bits, 1)) {
        return dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
    }

    // One linear run: hand the whole thing to the 1D mode (24-bit size field).
    const bool rows_are_packed = (row_size == src_stride) && (row_size == dst_stride);
    if (nrows == 1 || rows_are_packed) {
        return dma_queue_push_single_1d(q, dptr, row_size * nrows);
    }

    // Strided and too large for the 2D fields: issue the rows one at a time.
    uint8_t       *dst_base = (uint8_t *) dptr.dst;
    const uint8_t *src_base = (const uint8_t *) dptr.src;

    for (size_t row = 0; row < nrows; ++row) {
        if (row > 0) {
            // Retire a queue entry before submitting the next row.
            dma_queue_pop(q);
        }

        dma_ptr row_ptr = dma_make_ptr(dst_base + row * dst_stride, src_base + row * src_stride);
        if (!dma_queue_push_single_1d(q, row_ptr, row_size)) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
#else // HVX_ARCH >= 75
|
||||
|
||||
// On this architecture level (the #else branch, v75 and up) the 2D descriptor
// uses 24-bit fields, so every request is forwarded unchanged as a single 2D
// descriptor.
static inline bool dma_queue_push(dma_queue *q,
                                  dma_ptr    dptr,
                                  size_t     dst_stride,
                                  size_t     src_stride,
                                  size_t     row_size,
                                  size_t     nrows) {
    const bool queued = dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
    return queued;
}
|
||||
|
||||
#endif
|
||||
|
||||
// Convenience wrapper around dma_queue_push(): the two row sizes are passed as
// the destination/source strides, and the SOURCE row size is used as the
// number of bytes copied per row.
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
    const size_t bytes_per_row = src_row_size;
    return dma_queue_push(q, dptr, dst_row_size, src_row_size, bytes_per_row, nrows);
}
|
||||
|
||||
// Convenience wrapper around dma_queue_push(): the two row sizes are passed as
// the destination/source strides, and the DESTINATION row size is used as the
// number of bytes copied per row.
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
    const size_t bytes_per_row = dst_row_size;
    return dma_queue_push(q, dptr, dst_row_size, src_row_size, bytes_per_row, nrows);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -21,6 +21,15 @@ static inline void hex_dump_uint8_line(char * pref, const uint8_t * x, uint32_t
|
|||
FARF(HIGH, "%s\n", str);
|
||||
}
|
||||
|
||||
static inline void hex_dump_uint32_line(char * pref, const uint32_t * x, uint32_t n) {
|
||||
char str[1024], *p = str, *p_end = str + sizeof(str);
|
||||
p += snprintf(p, p_end - p, "%s: ", pref);
|
||||
for (int i = 0; i < n; i++) {
|
||||
p += snprintf(p, p_end - p, "%u, ", (unsigned int) x[i]);
|
||||
}
|
||||
FARF(HIGH, "%s\n", str);
|
||||
}
|
||||
|
||||
static inline void hex_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
|
||||
char str[1024], *p = str, *p_end = str + sizeof(str);
|
||||
p += snprintf(p, p_end - p, "%s: ", pref);
|
||||
|
|
|
|||
|
|
@ -29,10 +29,22 @@ static inline uint64_t hex_get_pktcnt() {
|
|||
return pktcnt;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_aligned(void * addr, uint32_t align) {
|
||||
// Integer division rounded up: the smallest q such that q * den >= num.
// Uses quotient + remainder instead of (num + den - 1) / den so that the
// intermediate value cannot wrap when num is close to SIZE_MAX.
// `den` must be non-zero.
static inline size_t hmx_ceil_div(size_t num, size_t den) {
    return num / den + (size_t) (num % den != 0);
}
|
||||
|
||||
// Returns 1 when none of the address bits selected by (align - 1) are set,
// 0 otherwise. The mask form is only a true alignment test when `align` is a
// power of two.
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
    const uint32_t low_bits = align - 1;
    const size_t   misalign = (size_t) addr & low_bits;
    return misalign == 0;
}
|
||||
|
||||
// Round `v` up to the next multiple of `align` (any non-zero value, not only
// powers of two).
static inline size_t hex_align_up(size_t v, size_t align) {
    const size_t n_units = hmx_ceil_div(v, align);
    return n_units * align;
}
|
||||
|
||||
// Round `v` down to the previous multiple of `align` (any non-zero value, not
// only powers of two) by dropping the remainder.
static inline size_t hex_align_down(size_t v, size_t align) {
    const size_t excess = v % align;
    return v - excess;
}
|
||||
|
||||
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
|
|
@ -43,6 +55,14 @@ static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
|
|||
return m * ((n + m - 1) / m);
|
||||
}
|
||||
|
||||
// Smaller of two size_t values.
static inline size_t hex_smin(size_t a, size_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
||||
|
||||
// Larger of two size_t values.
static inline size_t hex_smax(size_t a, size_t b) {
    if (a > b) {
        return a;
    }
    return b;
}
|
||||
|
||||
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
|
||||
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
|
||||
Q6_l2fetch_AP((void *) p, control);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,72 @@
|
|||
// HMX operation entry-point declarations.
|
||||
// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
|
||||
|
||||
#ifndef HMX_OPS_H
|
||||
#define HMX_OPS_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef restrict
|
||||
# define restrict __restrict
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct htp_context; // forward declaration
|
||||
|
||||
// Argument bundle passed to hmx_mat_mul_permuted_w16a32_batched().
// NOTE(review): field meanings below are inferred from names and from the
// comments on the non-batched entry point — confirm against the implementation.
typedef struct {
|
||||
float *dst; // output matrix (FP32)
|
||||
const float *activation; // FP32 activation input
|
||||
const __fp16 *permuted_weight; // FP16 weights
|
||||
int m; // matrix dimensions, presumably as in the m/k/n
|
||||
int k; // arguments of the non-batched entry point
|
||||
int n;
|
||||
int act_stride; // presumably activation row stride in elements
|
||||
int weight_stride; // presumably weight row stride in elements
|
||||
int dst_stride; // presumably output row stride — units to be confirmed
|
||||
int ne02; // presumably src0 extents in dims 2/3
|
||||
int ne03;
|
||||
int ne12; // presumably src1 extents in dims 2/3
|
||||
int ne13;
|
||||
size_t src0_nb2; // presumably byte strides of src0 in dims 2/3
|
||||
size_t src0_nb3;
|
||||
size_t src1_nb2; // presumably byte strides of src1 in dims 2/3
|
||||
size_t src1_nb3;
|
||||
size_t dst_nb2; // presumably byte strides of dst in dims 2/3
|
||||
size_t dst_nb3;
|
||||
} hmx_matmul_w16a32_batched_params_t;
|
||||
|
||||
// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
|
||||
// act_stride: activation row stride in elements (= k for contiguous, or
|
||||
// nb[1]/sizeof(float) for permuted tensors like attention Q).
|
||||
// weight_stride: weight row stride in elements (= k for compact weights, or
|
||||
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
|
||||
int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const __fp16 *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride);
|
||||
|
||||
// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32.
|
||||
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
|
||||
int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx,
|
||||
const hmx_matmul_w16a32_batched_params_t *params);
|
||||
|
||||
// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL)
|
||||
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int weight_type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HMX_OPS_H
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
// Conditional fine-grained profiling macros for HMX operations.
|
||||
//
|
||||
// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this
|
||||
// header) to instrument sub-operation latencies with HAP qtimer. When the
|
||||
// macro is not defined the TIMER_* helpers expand to nothing so there is zero
|
||||
// overhead.
|
||||
//
|
||||
// Usage:
|
||||
// TIMER_DEFINE(my_phase); // declare accumulator variable
|
||||
// TIMER_START(my_phase); // snapshot start time
|
||||
// ... work ...
|
||||
// TIMER_STOP(my_phase); // accumulate elapsed ticks
|
||||
// FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase));
|
||||
|
||||
#ifndef HMX_PROFILE_H
|
||||
#define HMX_PROFILE_H
|
||||
|
||||
#include <HAP_perf.h>
|
||||
|
||||
// #define ENABLE_PROFILE_TIMERS
|
||||
|
||||
#if defined(ENABLE_PROFILE_TIMERS)
|
||||
# define TIMER_DEFINE(name) int64_t name##_ticks = 0
|
||||
# define TIMER_START(name) int64_t name##_t0 = HAP_perf_get_qtimer_count()
|
||||
# define TIMER_STOP(name) name##_ticks += HAP_perf_get_qtimer_count() - name##_t0
|
||||
# define TIMER_US(name) HAP_perf_qtimer_count_to_us(name##_ticks)
|
||||
#else
|
||||
# define TIMER_DEFINE(name)
|
||||
# define TIMER_START(name)
|
||||
# define TIMER_STOP(name)
|
||||
# define TIMER_US(name) 0LL
|
||||
#endif
|
||||
|
||||
#endif // HMX_PROFILE_H
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
// HMX tile-level inline helpers (FP16 32x32 tile operations).
|
||||
// Ported from htp-ops-lib/include/dsp/hmx_utils.h. (https://github.com/haozixu/htp-ops-lib)
|
||||
|
||||
#ifndef HMX_UTILS_H
|
||||
#define HMX_UTILS_H
|
||||
|
||||
#include <hexagon_types.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define HMX_FP16_TILE_N_ROWS 32
|
||||
#define HMX_FP16_TILE_N_COLS 32
|
||||
#define HMX_FP16_TILE_N_ELMS 1024
|
||||
#define HMX_FP16_TILE_SIZE 2048
|
||||
|
||||
#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
|
||||
|
||||
// Issues the `bias = mxmem2(...)` instruction with `scales` as its address
// operand. Presumably this loads the per-column output scales applied when the
// accumulator is converted — confirm against the HMX documentation.
static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
|
||||
asm volatile("bias = mxmem2(%0)" :: "r"(scales));
|
||||
}
|
||||
|
||||
// Initialise aligned 256-byte area with scale vector + zero padding.
|
||||
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
|
||||
HVX_Vector *pv = (HVX_Vector *)out_scales;
|
||||
*pv++ = v_scale;
|
||||
*pv = Q6_V_vzero();
|
||||
}
|
||||
|
||||
// Load multiple contiguous tiles with :deep streaming.
|
||||
// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
|
||||
// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
|
||||
// boundary, otherwise the mxmem instruction will raise a precise bus error.
|
||||
// Callers must ensure their VTCM layout satisfies this constraint.
|
||||
static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
|
||||
const __fp16 *col_tiles,
|
||||
size_t n_tiles) {
|
||||
// Region size minus one, shared by both operands.
// NOTE(review): n_tiles == 0 makes this unsigned subtraction wrap to SIZE_MAX;
// callers presumably always pass n_tiles >= 1 — confirm.
size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
|
||||
asm volatile(
|
||||
"{ activation.hf = mxmem(%0, %1):deep\n"
|
||||
"weight.hf = mxmem(%2, %3) }\n"
|
||||
:: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
// Load a single activation+weight tile pair (no :deep streaming).
|
||||
// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula
|
||||
// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
|
||||
// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
|
||||
// places a tile near a 4 MB bank boundary, the oversized region crosses it and
|
||||
// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly
|
||||
// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
|
||||
static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
|
||||
const __fp16 *wt_tile) {
|
||||
// The literal 2047 used for both operands equals HMX_FP16_TILE_SIZE - 1,
// i.e. the region limit for exactly one tile.
asm volatile(
|
||||
"{ activation.hf = mxmem(%0, %1)\n"
|
||||
"weight.hf = mxmem(%2, %3) }\n"
|
||||
:: "r"(act_tile), "r"(2047),
|
||||
"r"(wt_tile), "r"(2047)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
|
||||
// Use the combined convert-and-store instruction (matches the reference
|
||||
// Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence
|
||||
// "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
|
||||
// Operands: %0 is the output address, %1 is the constant 0.
// NOTE(review): the meaning of the second operand is not visible here — confirm.
asm volatile(
|
||||
"mxmem(%0, %1):after.hf = acc\n"
|
||||
:: "r"(out), "r"(0)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
// Compute inner product of two vectors of tiles and store result.
|
||||
// Two steps: stream `n_tiles` tile pairs from `row_tiles` / `col_tiles` via
// hmx_load_tiles_fp16(), then write the accumulator to `out` via
// hmx_consume_accumulator_fp16().
static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
|
||||
const __fp16 *row_tiles,
|
||||
const __fp16 *col_tiles,
|
||||
size_t n_tiles) {
|
||||
hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
|
||||
hmx_consume_accumulator_fp16(out);
|
||||
}
|
||||
|
||||
// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
|
||||
|
||||
// Bump allocator step: returns the current cursor value and advances the
// cursor in `*vtcm_ptr` by `size` bytes. No bounds or alignment checks are
// performed.
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
    uint8_t *const base = *vtcm_ptr;

    *vtcm_ptr = base + size;
    return base;
}
|
||||
|
||||
#endif // HMX_UTILS_H
|
||||
|
|
@ -30,6 +30,18 @@ struct htp_context {
|
|||
atomic_bool vtcm_needs_release;
|
||||
|
||||
uint32_t opmask;
|
||||
|
||||
// Cached src1 spad position from the last quantize pass.
|
||||
// When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM
|
||||
// at this address; the matmul must read from here instead of recomputing
|
||||
// the offset (which depends on the current op's src0 size).
|
||||
uint8_t * prev_src1_spad;
|
||||
|
||||
// HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX)
|
||||
#ifdef HTP_HAS_HMX
|
||||
int hmx_enabled; // Runtime flag: HMX initialisation succeeded
|
||||
size_t vtcm_scratch_size; // Usable dynamic scratch (vtcm_size minus tail reservation)
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
|
|
|||
|
|
@ -32,13 +32,14 @@ enum htp_status {
|
|||
// Duplicated here because we can't include full ggml.h in the htp build.
|
||||
// We have some static_asserts in the cpp code to ensure things are in sync.
|
||||
enum htp_data_type {
|
||||
HTP_TYPE_F32 = 0,
|
||||
HTP_TYPE_F16 = 1,
|
||||
HTP_TYPE_Q4_0 = 2,
|
||||
HTP_TYPE_Q8_0 = 8,
|
||||
HTP_TYPE_I32 = 26,
|
||||
HTP_TYPE_I64 = 27,
|
||||
HTP_TYPE_MXFP4 = 39,
|
||||
HTP_TYPE_F32 = 0,
|
||||
HTP_TYPE_F16 = 1,
|
||||
HTP_TYPE_Q4_0 = 2,
|
||||
HTP_TYPE_Q8_0 = 8,
|
||||
HTP_TYPE_IQ4_NL = 20,
|
||||
HTP_TYPE_I32 = 26,
|
||||
HTP_TYPE_I64 = 27,
|
||||
HTP_TYPE_MXFP4 = 39,
|
||||
HTP_TYPE_COUNT
|
||||
};
|
||||
|
||||
|
|
@ -53,6 +54,10 @@ enum htp_op {
|
|||
HTP_OP_RMS_NORM,
|
||||
HTP_OP_UNARY_SILU,
|
||||
HTP_OP_UNARY_GELU,
|
||||
HTP_OP_UNARY_SIGMOID,
|
||||
HTP_OP_UNARY_EXP,
|
||||
HTP_OP_UNARY_NEG,
|
||||
HTP_OP_UNARY_SOFTPLUS,
|
||||
HTP_OP_GLU_SWIGLU,
|
||||
HTP_OP_GLU_SWIGLU_OAI,
|
||||
HTP_OP_GLU_GEGLU,
|
||||
|
|
@ -69,6 +74,7 @@ enum htp_op {
|
|||
HTP_OP_SQRT,
|
||||
HTP_OP_SUM_ROWS,
|
||||
HTP_OP_SSM_CONV,
|
||||
HTP_OP_REPEAT,
|
||||
INVALID
|
||||
};
|
||||
|
||||
|
|
@ -82,6 +88,8 @@ static inline size_t htp_t_block_size(uint32_t t) {
|
|||
return QK4_0;
|
||||
case HTP_TYPE_Q8_0:
|
||||
return QK8_0;
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return QK4_NL;
|
||||
case HTP_TYPE_MXFP4:
|
||||
return QK_MXFP4;
|
||||
default:
|
||||
|
|
@ -100,6 +108,8 @@ static inline size_t htp_type_nbytes(uint32_t t) {
|
|||
return sizeof(block_q4_0);
|
||||
case HTP_TYPE_Q8_0:
|
||||
return sizeof(block_q8_0);
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return sizeof(block_iq4_nl);
|
||||
case HTP_TYPE_MXFP4:
|
||||
return sizeof(block_mxfp4);
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx);
|
|||
int op_set_rows(struct htp_ops_context * octx);
|
||||
int op_get_rows(struct htp_ops_context * octx);
|
||||
int op_cpy(struct htp_ops_context * octx);
|
||||
int op_repeat(struct htp_ops_context * octx);
|
||||
int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
#include "remote.idl"
|
||||
|
||||
interface htp_iface : remote_handle64 {
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx);
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
|
||||
AEEResult stop();
|
||||
AEEResult enable_etm();
|
||||
AEEResult disable_etm();
|
||||
|
|
|
|||
|
|
@ -3,10 +3,15 @@
|
|||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "hex-utils.h"
|
||||
#include "hvx-types.h"
|
||||
|
||||
#define hvx_vmem(A) *((HVX_Vector *)(A))
|
||||
#define hvx_vmemu(A) *((HVX_UVector *)(A))
|
||||
|
||||
static inline void hvx_vec_store_u(void * restrict dst, uint32_t n, HVX_Vector v) {
|
||||
// Rotate as needed.
|
||||
v = Q6_V_vlalign_VVR(v, v, (size_t) dst);
|
||||
|
|
@ -110,11 +115,15 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
|
|||
return Q6_Q_and_QQ(p_exp, p_frac);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) {
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
|
||||
const HVX_Vector zero = Q6_V_vzero();
|
||||
HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
|
||||
HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
|
||||
HVX_Vector v = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0)));
|
||||
return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) {
|
||||
HVX_Vector v = Q6_Vh_vdeal_Vh(hvx_vec_f32_to_f16_shuff(v0, v1));
|
||||
|
||||
#if __HVX_ARCH__ < 79
|
||||
// replace NaNs with -INF, older arches produce NaNs for (-INF + 0.0)
|
||||
|
|
@ -126,6 +135,30 @@ static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) {
|
|||
return v;
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ >= 79
|
||||
static inline HVX_VectorPair hvx_vec_f16_to_f32_shuff(HVX_Vector v) {
|
||||
const HVX_Vector one = hvx_vec_splat_f16(1.0);
|
||||
HVX_VectorPair p = Q6_Wsf_vmpy_VhfVhf(v, one);
|
||||
return Q6_W_vcombine_VV(Q6_V_hi_W(p), Q6_V_lo_W(p));
|
||||
}
|
||||
static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
|
||||
const HVX_Vector one = hvx_vec_splat_f16(1.0);
|
||||
HVX_VectorPair p = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(v), one);
|
||||
return Q6_W_vcombine_VV(Q6_V_hi_W(p), Q6_V_lo_W(p));
|
||||
}
|
||||
#else
|
||||
static inline HVX_VectorPair hvx_vec_f16_to_f32_shuff(HVX_Vector v) {
|
||||
const HVX_Vector one = hvx_vec_splat_f16(1.0);
|
||||
HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(v, one);
|
||||
return Q6_W_vcombine_VV(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)), Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)));
|
||||
}
|
||||
static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
|
||||
const HVX_Vector one = hvx_vec_splat_f16(1.0);
|
||||
HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(v), one);
|
||||
return Q6_W_vcombine_VV(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)), Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Q6_Vsf_equals_Vw is only available on v73+.*/
|
||||
#if __HVX_ARCH__ < 73
|
||||
static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-floor.h"
|
||||
|
|
@ -16,8 +17,8 @@
|
|||
#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
|
||||
#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
|
||||
#define EXP_ONE (0x3f800000) // 1.0
|
||||
#define EXP_RANGE_R (0x41a00000) // 20.0
|
||||
#define EXP_RANGE_L (0xc1a00000) // -20.0
|
||||
#define EXP_RANGE_R (0x42B16666) // 88.7
|
||||
#define EXP_RANGE_L (0xC2B00000) // -88.0 (approx log(FLT_MIN))
|
||||
|
||||
static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
|
||||
HVX_Vector z_qf32_v;
|
||||
|
|
@ -47,12 +48,12 @@ static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
|
|||
|
||||
HVX_Vector temp_v = in_vec;
|
||||
|
||||
// Clamp inputs to (-20.0, 20.0)
|
||||
// Clamp inputs to [EXP_RANGE_L, EXP_RANGE_R] = [-88.0, 88.7] to avoid overflow/underflow
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
|
||||
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
|
||||
|
||||
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
|
||||
epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
|
||||
|
|
@ -69,12 +70,12 @@ static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
|
|||
// normalize before every QFloat's vmpy
|
||||
x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
|
||||
|
||||
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
|
||||
|
||||
// z = x * x;
|
||||
z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
|
||||
z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
|
||||
|
||||
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
|
||||
|
||||
// y = E4 + E5 * x;
|
||||
E_const = Q6_V_vsplat_R(EXP_COEFF_5);
|
||||
y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
|
||||
|
|
@ -145,7 +146,7 @@ static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max
|
|||
return Q6_V_vmux_QVV(pred0, inf, out);
|
||||
}
|
||||
|
||||
static inline void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
|
||||
static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int num_elems, bool negate) {
|
||||
int left_over = num_elems & (VLEN_FP32 - 1);
|
||||
int num_elems_whole = num_elems - left_over;
|
||||
|
||||
|
|
@ -162,7 +163,7 @@ static inline void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict
|
|||
HVX_Vector vec_out = Q6_V_vzero();
|
||||
|
||||
static const float kInf = INFINITY;
|
||||
static const float kMaxExp = 88.02f; // log(INF)
|
||||
static const float kMaxExp = 88.7f;
|
||||
|
||||
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
|
||||
const HVX_Vector inf = hvx_vec_splat_f32(kInf);
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#define HVX_SIGMOID_H
|
||||
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-inverse.h"
|
||||
|
||||
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
|
||||
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
|
||||
|
|
|
|||
|
|
@ -15,12 +15,4 @@
|
|||
#include "hvx-div.h"
|
||||
#include "hvx-base.h"
|
||||
|
||||
#ifndef GATHER_TYPE
|
||||
# if defined(__hexagon__)
|
||||
# define GATHER_TYPE(_a) (intptr_t) _a
|
||||
# else
|
||||
# define GATHER_TYPE(_a) (HVX_Vector *) _a
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* HVX_UTILS_H */
|
||||
|
|
|
|||
|
|
@ -25,6 +25,10 @@
|
|||
#include "htp-ops.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
#include "hmx-ops.h"
|
||||
#endif // HTP_HAS_HMX
|
||||
|
||||
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
struct htp_context * ctx;
|
||||
int err = 0;
|
||||
|
|
@ -163,6 +167,9 @@ static int vtcm_acquire(struct htp_context * ctx) {
|
|||
}
|
||||
|
||||
ctx->vtcm_inuse = true;
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -207,7 +214,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
|
|||
HAP_compute_res_attr_init(&attr);
|
||||
HAP_compute_res_attr_set_serialize(&attr, 0);
|
||||
HAP_compute_res_attr_set_cache_mode(&attr, 1);
|
||||
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
|
||||
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
|
||||
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
|
||||
HAP_compute_res_attr_set_hmx_param(&attr, 1);
|
||||
|
||||
|
|
@ -246,7 +253,7 @@ static void vtcm_free(struct htp_context * ctx) {
|
|||
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
|
||||
static void htp_error_callback(dspqueue_t queue, int error, void * context);
|
||||
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
|
||||
if (!ctx) {
|
||||
|
|
@ -280,6 +287,21 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
|||
return AEE_ENOMEMORY;
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (use_hmx) {
|
||||
ctx->vtcm_scratch_size = ctx->vtcm_size;
|
||||
ctx->hmx_enabled = 1;
|
||||
|
||||
FARF(HIGH, "HMX enabled: vtcm-scratch %zu", ctx->vtcm_scratch_size);
|
||||
} else {
|
||||
// HMX disabled: skip HMX initialisation so the
|
||||
// dispatch loop falls through to the HVX compute paths.
|
||||
ctx->hmx_enabled = 0;
|
||||
ctx->vtcm_scratch_size = ctx->vtcm_size;
|
||||
FARF(HIGH, "HMX disabled (use_hmx=0): vtcm-scratch %zu", ctx->vtcm_scratch_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
qurt_sysenv_max_hthreads_t hw_threads;
|
||||
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
||||
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
|
||||
|
|
@ -297,7 +319,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
|||
ctx->n_threads = n_hvx;
|
||||
for (int i = 0; i < ctx->n_threads; i++) {
|
||||
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
|
||||
ctx->dma[i] = dma_queue_create(64);
|
||||
ctx->dma[i] = dma_queue_create(128);
|
||||
}
|
||||
|
||||
// init worker pool
|
||||
|
|
@ -340,6 +362,12 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
|||
for (int i = 0; i < ctx->n_threads; i++) {
|
||||
dma_queue_delete(ctx->dma[i]);
|
||||
}
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (ctx->hmx_enabled) {
|
||||
ctx->hmx_enabled = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
vtcm_free(ctx);
|
||||
|
||||
|
|
@ -375,8 +403,9 @@ static int send_htp_rsp(struct htp_context * c,
|
|||
struct dspqueue_buffer * bufs,
|
||||
size_t n_bufs,
|
||||
struct profile_data * prof) {
|
||||
// Prep response struct
|
||||
// Prep response struct (zero-init to clear cmp/unused union)
|
||||
struct htp_general_rsp rsp;
|
||||
memset(&rsp, 0, sizeof(rsp));
|
||||
rsp.op = op;
|
||||
rsp.status = status;
|
||||
rsp.prof_usecs = prof->usecs;
|
||||
|
|
@ -516,6 +545,39 @@ static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req,
|
|||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_repeat_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
|
||||
// We had written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[1].fd;
|
||||
rsp_bufs[0].ptr = bufs[1].ptr;
|
||||
rsp_bufs[0].offset = bufs[1].offset;
|
||||
rsp_bufs[0].size = bufs[1].size;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
// Setup Op context
|
||||
struct htp_ops_context octx = { 0 };
|
||||
octx.ctx = ctx;
|
||||
octx.src0 = req->src0;
|
||||
octx.dst = req->dst;
|
||||
octx.flags = req->flags;
|
||||
octx.op = req->op;
|
||||
|
||||
// Update data pointers
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.dst.data = (uint32_t) bufs[1].ptr;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = op_repeat(&octx);
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
|
||||
|
|
@ -1004,6 +1066,208 @@ static void proc_flash_attn_ext_req(struct htp_context * ctx,
|
|||
send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
// ---------------------------------------------------------------------------
|
||||
// HMX operation wrappers — self-contained, bypass htp_ops_context / htp_spad.
|
||||
// VTCM, DMA and thread dispatch are managed inside the HMX kernels.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void proc_hmx_matmul_req(struct htp_context * ctx,
|
||||
struct htp_general_req * req,
|
||||
struct dspqueue_buffer * bufs,
|
||||
size_t n_bufs) {
|
||||
// HMX weight tile requires N to be 32-aligned.
|
||||
if (req->src0.ne[1] % 32 != 0) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
|
||||
const bool is_batched = (req->src0.ne[2] * req->src0.ne[3] > 1 ||
|
||||
req->src1.ne[2] * req->src1.ne[3] > 1);
|
||||
|
||||
// Quantised HMX kernels only handle flat 2D matmul (host already rejects
|
||||
// batched quantised, but guard here too). F16 batched matmul is handled
|
||||
// by the dedicated wrapper in hmx-matmul-ops.c.
|
||||
if (is_batched &&
|
||||
req->src0.type != HTP_TYPE_F16) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
|
||||
// HMX assumes contiguous row-major layout. Fall back for permuted
|
||||
// tensors where strides are non-monotonic (e.g. transposed KV cache).
|
||||
if (req->src0.nb[0] > req->src0.nb[1] ||
|
||||
req->src1.nb[0] > req->src1.nb[1]) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
|
||||
// M alignment: when M > 32 but not 32-aligned, we split into
|
||||
// HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows).
|
||||
// When M <= 32 and not 32-aligned, fall back entirely to HVX.
|
||||
const int m_total = (int) req->src1.ne[1];
|
||||
const int m_tail = m_total % 32;
|
||||
const int m_hmx = m_total - m_tail;
|
||||
|
||||
if (m_hmx == 0) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
|
||||
// HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
|
||||
// Other types fall back to HVX.
|
||||
{
|
||||
uint32_t wtype = req->src0.type;
|
||||
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL &&
|
||||
wtype != HTP_TYPE_MXFP4) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
// Quantised HMX path requires K aligned to 256 (x4x2 super-block).
|
||||
// F16 HMX path requires K aligned to 32 (tile width).
|
||||
if (wtype != HTP_TYPE_F16 && req->src0.ne[0] % 256 != 0) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
if (wtype == HTP_TYPE_F16 && req->src0.ne[0] % 32 != 0) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
(void) n_bufs;
|
||||
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
rsp_bufs[0].fd = bufs[2].fd;
|
||||
rsp_bufs[0].ptr = bufs[2].ptr;
|
||||
rsp_bufs[0].size = bufs[2].size;
|
||||
rsp_bufs[0].offset = bufs[2].offset;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);
|
||||
|
||||
// src0 = weights, src1 = activation, dst = output
|
||||
void * wgt = (void *) bufs[0].ptr;
|
||||
float * act = (float *) bufs[1].ptr;
|
||||
float * dst = (float *) bufs[2].ptr;
|
||||
|
||||
int k = (int) req->src0.ne[0]; // inner dimension
|
||||
int n = (int) req->src0.ne[1]; // weight columns
|
||||
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
|
||||
// --- Phase 1: HMX on the first m_hmx (32-aligned) rows ---
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
int ret = -1;
|
||||
|
||||
const int ne02 = (int) req->src0.ne[2];
|
||||
const int ne03 = (int) req->src0.ne[3];
|
||||
const int ne12 = (int) req->src1.ne[2];
|
||||
const int ne13 = (int) req->src1.ne[3];
|
||||
// Row strides in elements. For compact tensors these equal k; for
|
||||
// permuted attention views they can be larger, so pass the real stride.
|
||||
const int act_stride = (int)(req->src1.nb[1] / sizeof(float));
|
||||
const int weight_stride = (int)(req->src0.nb[1] / sizeof(__fp16));
|
||||
|
||||
switch (req->src0.type) {
|
||||
case HTP_TYPE_F16:
|
||||
if (is_batched) {
|
||||
hmx_matmul_w16a32_batched_params_t batch_params = {
|
||||
.dst = dst,
|
||||
.activation = act,
|
||||
.permuted_weight = (const __fp16 *) wgt,
|
||||
.m = m_hmx,
|
||||
.k = k,
|
||||
.n = n,
|
||||
.act_stride = act_stride,
|
||||
.weight_stride = weight_stride,
|
||||
.dst_stride = (int)(req->dst.nb[1] / sizeof(float)),
|
||||
.ne02 = ne02,
|
||||
.ne03 = ne03,
|
||||
.ne12 = ne12,
|
||||
.ne13 = ne13,
|
||||
.src0_nb2 = req->src0.nb[2],
|
||||
.src0_nb3 = req->src0.nb[3],
|
||||
.src1_nb2 = req->src1.nb[2],
|
||||
.src1_nb3 = req->src1.nb[3],
|
||||
.dst_nb2 = req->dst.nb[2],
|
||||
.dst_nb3 = req->dst.nb[3],
|
||||
};
|
||||
ret = hmx_mat_mul_permuted_w16a32_batched(ctx, &batch_params);
|
||||
} else {
|
||||
ret = hmx_mat_mul_permuted_w16a32(ctx, dst, act,
|
||||
(const __fp16 *) wgt,
|
||||
m_hmx, k, n,
|
||||
act_stride,
|
||||
weight_stride);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
ret = hmx_mat_mul_permuted_qk_0_d16a32(ctx, dst, act,
|
||||
(const uint8_t *) wgt,
|
||||
m_hmx, k, n, (int) req->src0.type);
|
||||
break;
|
||||
}
|
||||
|
||||
if (ret == 0) {
|
||||
rsp_status = HTP_STATUS_OK;
|
||||
} else {
|
||||
FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret);
|
||||
vtcm_release(ctx);
|
||||
req->flags &= ~HTP_OPFLAGS_SKIP_QUANTIZE;
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
vtcm_release(ctx);
|
||||
}
|
||||
|
||||
// --- Phase 2: HVX on the remaining m_tail rows ---
|
||||
if (m_tail > 0 && rsp_status == HTP_STATUS_OK) {
|
||||
struct htp_ops_context octx = { 0 };
|
||||
octx.ctx = ctx;
|
||||
octx.src0 = req->src0; // weights: unchanged
|
||||
octx.src1 = req->src1;
|
||||
octx.src1.ne[1] = m_tail; // only tail rows
|
||||
octx.dst = req->dst;
|
||||
octx.dst.ne[1] = m_tail; // only tail rows
|
||||
// Always re-quantize tail src1: HMX Phase 1 overwrites VTCM,
|
||||
// so any previously cached quantized data (SKIP_QUANTIZE pipeline)
|
||||
// is invalid.
|
||||
octx.flags = req->flags & ~HTP_OPFLAGS_SKIP_QUANTIZE;
|
||||
octx.op = req->op;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
// Offset activation and dst pointers past the HMX-processed rows.
|
||||
// Use nb[1] (row stride in bytes) to compute the byte offset.
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.src1.data = (uint32_t)((uint8_t *) bufs[1].ptr + (size_t) m_hmx * req->src1.nb[1]);
|
||||
octx.dst.data = (uint32_t)((uint8_t *) bufs[2].ptr + (size_t) m_hmx * req->dst.nb[1]);
|
||||
|
||||
FARF(HIGH, "proc_hmx_matmul: HVX tail m_tail=%d act=%p dst=%p",
|
||||
m_tail, (void *)(uintptr_t) octx.src1.data, (void *)(uintptr_t) octx.dst.data);
|
||||
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
uint32_t hvx_ret = op_matmul(&octx);
|
||||
vtcm_release(ctx);
|
||||
if (hvx_ret != HTP_STATUS_OK) {
|
||||
FARF(ERROR, "HVX tail matmul failed (ret=%u)", hvx_ret);
|
||||
rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
}
|
||||
} else {
|
||||
rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
#endif // HTP_HAS_HMX
|
||||
|
||||
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
struct htp_context * ctx = (struct htp_context *) context;
|
||||
|
||||
|
|
@ -1056,7 +1320,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
|||
FARF(ERROR, "Bad matmul-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_matmul_req(ctx, &req, bufs, n_bufs);
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (ctx->hmx_enabled) {
|
||||
proc_hmx_matmul_req(ctx, &req, bufs, n_bufs);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
proc_matmul_req(ctx, &req, bufs, n_bufs);
|
||||
}
|
||||
break;
|
||||
|
||||
case HTP_OP_MUL_MAT_ID:
|
||||
|
|
@ -1090,6 +1361,10 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
|||
|
||||
case HTP_OP_SQR:
|
||||
case HTP_OP_SQRT:
|
||||
case HTP_OP_UNARY_NEG:
|
||||
case HTP_OP_UNARY_EXP:
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad unary-req buffer list");
|
||||
continue;
|
||||
|
|
@ -1175,6 +1450,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
|||
proc_cpy_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_REPEAT:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad repeat-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_repeat_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_ARGSORT:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad argsort-req buffer list");
|
||||
|
|
|
|||
|
|
@ -60,6 +60,16 @@ static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
|
|||
0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20,
|
||||
};
|
||||
|
||||
// IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue
|
||||
// kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
|
||||
static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = {
|
||||
0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0,
|
||||
0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
|
||||
0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0,
|
||||
0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
|
@ -68,6 +78,73 @@ static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_full(const uint8_t * restrict ptr) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||
HVX_Vector v2_3 = vptr[1]; // ...
|
||||
HVX_Vector v4_5 = vptr[2]; // ...
|
||||
HVX_Vector v6_7 = vptr[3]; // ...
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
|
||||
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
|
||||
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
|
||||
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
|
||||
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
|
||||
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
|
||||
HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
|
||||
HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
|
||||
|
||||
v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||
v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||
v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
|
||||
v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
|
||||
v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
|
||||
v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
|
||||
v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
|
||||
v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
|
||||
|
||||
HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
|
||||
|
||||
HVX_Vector_x8 r;
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nb; i++) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
|
||||
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales
|
||||
|
||||
static inline size_t q8x4x2_row_size(uint32_t ne) {
|
||||
|
|
@ -921,6 +998,293 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
|
||||
}
|
||||
|
||||
// ======== IQ4_NL x Q8_0 vec_dot kernels ========
|
||||
// Same structure as Q4_0 vec_dot but uses IQ4_NL LUT-based load (4-bit index -> int8 kvalue).
|
||||
// Scale format is identical to Q4_0 (fp16 scales).
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_1x1(const int n,
|
||||
float * restrict s0,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vy0) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
|
||||
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
|
||||
|
||||
hvx_vec_store_u(s0, 4, r0_sum);
|
||||
}
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n,
|
||||
float * restrict s0,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vx1,
|
||||
const void * restrict vy0) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vx1 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
|
||||
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
|
||||
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
|
||||
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
|
||||
hvx_vec_store_u(s0, 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
|
||||
float * restrict s0,
|
||||
float * restrict s1,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vx1,
|
||||
const void * restrict vy0,
|
||||
const void * restrict vy1) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vx1 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
assert((unsigned long) vy1 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;
|
||||
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;
|
||||
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;
|
||||
|
||||
const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;
|
||||
const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;
|
||||
const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;
|
||||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;
|
||||
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
|
||||
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
|
||||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||
|
||||
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
|
||||
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
|
||||
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
|
||||
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
|
||||
|
||||
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
|
||||
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
|
||||
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
|
||||
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
|
||||
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
|
||||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
|
||||
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
|
||||
r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
|
||||
r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
|
||||
r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
|
||||
r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
|
||||
r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
|
||||
r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
|
||||
|
||||
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
|
||||
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
|
||||
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
|
||||
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
|
||||
|
||||
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
|
||||
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
|
||||
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
|
||||
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
|
||||
}
|
||||
|
||||
HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
|
||||
HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
|
||||
|
||||
hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);
|
||||
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);
|
||||
}
|
||||
|
||||
static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
|
||||
assert(n % 32 == 0); // min sub-block size
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
|
|
@ -2393,6 +2757,12 @@ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_t
|
|||
mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
|
||||
mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
|
||||
return 0;
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
mmctx->type = "iq4nlx4x2-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1;
|
||||
mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1;
|
||||
mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2;
|
||||
return 0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
mmctx->type = "mxfp4x4x2-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
|
||||
|
|
@ -2556,6 +2926,13 @@ int op_matmul(struct htp_ops_context * octx) {
|
|||
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
|
||||
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
|
||||
// Cache where src1 was written so subsequent SKIP_QUANTIZE ops can find it
|
||||
octx->ctx->prev_src1_spad = octx->src1_spad.data;
|
||||
} else {
|
||||
// SKIP_QUANTIZE: Q8 data lives at the address written by the previous
|
||||
// quantize pass. The current op may have a different src0 size (e.g.
|
||||
// IQ4_NL vs MXFP4), so src1_spad.data computed above could be wrong.
|
||||
octx->src1_spad.data = octx->ctx->prev_src1_spad;
|
||||
}
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
|
|
@ -2659,6 +3036,9 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
|||
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
|
||||
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
|
||||
octx->ctx->prev_src1_spad = octx->src1_spad.data;
|
||||
} else {
|
||||
octx->src1_spad.data = octx->ctx->prev_src1_spad;
|
||||
}
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,148 @@
|
|||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
// Per-op state shared by all repeat worker threads; filled in by op_repeat()
// and read by repeat_job_per_thread().
struct htp_repeat_context {
|
||||
struct htp_ops_context * octx; // op context providing the src0 and dst tensors
|
||||
|
||||
// Repeat counts per dimension: dst->ne[d] / src0->ne[d]
uint32_t nr0;
|
||||
uint32_t nr1;
|
||||
uint32_t nr2;
|
||||
uint32_t nr3;
|
||||
|
||||
uint32_t nrows_per_thread; // ceil(total_dst_rows / number of worker threads)
|
||||
uint32_t total_dst_rows; // ne1 * ne2 * ne3
|
||||
|
||||
size_t type_size; // bytes per element: 4 for F32, 2 for F16
|
||||
};
|
||||
|
||||
static void repeat_job_per_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_repeat_context * rctx = (const struct htp_repeat_context *) data;
|
||||
struct htp_ops_context * octx = rctx->octx;
|
||||
const struct htp_tensor * src = &octx->src0;
|
||||
const struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
const uint32_t ne00 = src->ne[0];
|
||||
const uint32_t ne01 = src->ne[1];
|
||||
const uint32_t ne02 = src->ne[2];
|
||||
const uint32_t ne03 = src->ne[3];
|
||||
|
||||
const uint32_t nb00 = src->nb[0];
|
||||
const uint32_t nb01 = src->nb[1];
|
||||
const uint32_t nb02 = src->nb[2];
|
||||
const uint32_t nb03 = src->nb[3];
|
||||
|
||||
const uint32_t ne0 = dst->ne[0];
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
const uint32_t ne2 = dst->ne[2];
|
||||
const uint32_t ne3 = dst->ne[3];
|
||||
|
||||
const uint32_t nb0 = dst->nb[0];
|
||||
const uint32_t nb1 = dst->nb[1];
|
||||
const uint32_t nb2 = dst->nb[2];
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
const uint32_t nr0 = rctx->nr0;
|
||||
const uint32_t nr1 = rctx->nr1;
|
||||
const uint32_t nr2 = rctx->nr2;
|
||||
const uint32_t nr3 = rctx->nr3;
|
||||
|
||||
const size_t row_bytes = ne00 * rctx->type_size;
|
||||
|
||||
const uint32_t row_start = rctx->nrows_per_thread * ith;
|
||||
const uint32_t row_end = MIN(row_start + rctx->nrows_per_thread, rctx->total_dst_rows);
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
|
||||
// Decompose flat dst row index into (i1, i2, i3)
|
||||
const uint32_t i1 = dst_row % ne1;
|
||||
const uint32_t i2 = (dst_row / ne1) % ne2;
|
||||
const uint32_t i3 = dst_row / (ne1 * ne2);
|
||||
|
||||
// Map to source indices (tiling)
|
||||
const uint32_t k1 = i1 % ne01;
|
||||
const uint32_t k2 = i2 % ne02;
|
||||
const uint32_t k3 = i3 % ne03;
|
||||
|
||||
const uint8_t * src_row = (const uint8_t *) src->data + k1 * nb01 + k2 * nb02 + k3 * nb03;
|
||||
uint8_t * dst_base = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
// Tile along dimension 0
|
||||
for (uint32_t i0 = 0; i0 < nr0; i0++) {
|
||||
uint8_t * dst_ptr = dst_base + i0 * ne00 * nb0;
|
||||
memcpy(dst_ptr, src_row, row_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "repeat %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth, src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_repeat(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = &octx->src0;
|
||||
struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
// Validate that dst dims are multiples of src dims
|
||||
if (dst->ne[0] % src0->ne[0] != 0 ||
|
||||
dst->ne[1] % src0->ne[1] != 0 ||
|
||||
dst->ne[2] % src0->ne[2] != 0 ||
|
||||
dst->ne[3] % src0->ne[3] != 0) {
|
||||
FARF(ERROR, "repeat: dst dims must be multiples of src dims\n");
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
|
||||
size_t type_size;
|
||||
switch (src0->type) {
|
||||
case HTP_TYPE_F32: type_size = 4; break;
|
||||
case HTP_TYPE_F16: type_size = 2; break;
|
||||
default:
|
||||
FARF(ERROR, "repeat: unsupported type %u\n", src0->type);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows);
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
struct htp_repeat_context rctx = {
|
||||
.octx = octx,
|
||||
.nr0 = dst->ne[0] / src0->ne[0],
|
||||
.nr1 = dst->ne[1] / src0->ne[1],
|
||||
.nr2 = dst->ne[2] / src0->ne[2],
|
||||
.nr3 = dst->ne[3] / src0->ne[3],
|
||||
.nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads,
|
||||
.total_dst_rows = total_dst_rows,
|
||||
.type_size = type_size,
|
||||
};
|
||||
|
||||
FARF(HIGH, "repeat: (%ux%ux%ux%u) -> (%ux%ux%ux%u) nr=(%u,%u,%u,%u)\n",
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
rctx.nr0, rctx.nr1, rctx.nr2, rctx.nr3);
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, repeat_job_per_thread, &rctx, n_threads);
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
|
@ -195,7 +195,7 @@ static float hvx_softmax_f32(const uint8_t * restrict src,
|
|||
const float max) {
|
||||
hvx_sub_scalar_f32(spad, src, max, num_elems);
|
||||
|
||||
hvx_exp_f32(spad, dst, num_elems, false);
|
||||
hvx_exp_f32(dst, spad, num_elems, false);
|
||||
|
||||
float sum = hvx_reduce_sum_f32(dst, num_elems);
|
||||
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
|||
const int dr = scctx->nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + dr, d_inner);
|
||||
const int ir = ir1 - ir0;
|
||||
const uint32_t ir = ir1 - ir0;
|
||||
|
||||
if (ir0 >= ir1) {
|
||||
return; // No work for this thread
|
||||
|
|
@ -205,10 +205,10 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
|||
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
|
||||
Q6_vgather_ARMVw(src0_vec, GATHER_TYPE(spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0])),
|
||||
src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, GATHER_TYPE(spad_src1 + (i0 + i1 * nc) * sizeof(float)),
|
||||
src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
|
||||
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
|
||||
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
|
||||
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
|
||||
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
|
||||
|
|
@ -222,10 +222,10 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
|||
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
|
||||
Q6_vgather_ARMVw(src0_vec, GATHER_TYPE(spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0])),
|
||||
src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, GATHER_TYPE(spad_src1 + (i0 + i1 * nc) * sizeof(float)),
|
||||
src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
|
||||
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
|
||||
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
|
||||
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
|
||||
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
|
||||
|
|
|
|||
|
|
@ -9,6 +9,8 @@
|
|||
#include <string.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-exp.h"
|
||||
#include "hvx-sigmoid.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
|
@ -166,6 +168,75 @@ static void sqrt_f32(const float * restrict src,
|
|||
}
|
||||
}
|
||||
|
||||
// Negates num_rows rows of floats by scaling each row by -1.0f.
// Rows are row_size bytes apart in both src and dst; row_elems floats are
// processed per row. spad and op_params are not used.
static void neg_f32(const float * restrict src,
                    float * restrict       dst,
                    uint8_t * restrict     spad,
                    const uint32_t         num_rows,
                    const uint32_t         row_elems,
                    const size_t           row_size,
                    int32_t *              op_params) {
    const uint8_t * const in_bytes  = (const uint8_t *) src;
    uint8_t * const       out_bytes = (uint8_t *) dst;

    size_t row_off = 0;  // byte offset of the current row
    for (uint32_t row = 0; row < num_rows; row++, row_off += row_size) {
        hvx_scale_f32_aa(out_bytes + row_off, in_bytes + row_off, row_elems, -1.0f);
    }
}
|
||||
|
||||
// Applies hvx_exp_f32 to num_rows rows of floats.
// Rows are row_size bytes apart in both src and dst; row_elems floats are
// processed per row. spad and op_params are not used.
static void exp_f32(const float * restrict src,
                    float * restrict       dst,
                    uint8_t * restrict     spad,
                    const uint32_t         num_rows,
                    const uint32_t         row_elems,
                    const size_t           row_size,
                    int32_t *              op_params) {
    const uint8_t * const in_bytes  = (const uint8_t *) src;
    uint8_t * const       out_bytes = (uint8_t *) dst;

    size_t row_off = 0;  // byte offset of the current row
    for (uint32_t row = 0; row < num_rows; row++, row_off += row_size) {
        hvx_exp_f32(out_bytes + row_off, in_bytes + row_off, row_elems, false);
    }
}
|
||||
|
||||
// Applies hvx_sigmoid_f32_aa to num_rows rows of floats.
// Rows are row_size bytes apart in both src and dst; row_elems floats are
// processed per row. spad and op_params are not used.
static void sigmoid_f32(const float * restrict src,
                        float * restrict       dst,
                        uint8_t * restrict     spad,
                        const uint32_t         num_rows,
                        const uint32_t         row_elems,
                        const size_t           row_size,
                        int32_t *              op_params) {
    const uint8_t * const in_bytes  = (const uint8_t *) src;
    uint8_t * const       out_bytes = (uint8_t *) dst;

    size_t row_off = 0;  // byte offset of the current row
    for (uint32_t row = 0; row < num_rows; row++, row_off += row_size) {
        hvx_sigmoid_f32_aa(out_bytes + row_off, in_bytes + row_off, row_elems);
    }
}
|
||||
|
||||
static void softplus_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params) {
|
||||
// softplus(x) = log(1 + exp(x))
|
||||
// Match CPU reference: ggml_compute_softplus_f32() in ggml-impl.h
|
||||
for (uint32_t ir = 0; ir < num_rows; ir++) {
|
||||
const float * restrict src_f = (const float *)((const uint8_t *)src + (ir * row_size));
|
||||
float * restrict dst_f = (float *)((uint8_t *)dst + (ir * row_size));
|
||||
|
||||
for (uint32_t i = 0; i < row_elems; i++) {
|
||||
float x = src_f[i];
|
||||
// For x > 20: softplus(x) ≈ x (avoids exp overflow)
|
||||
dst_f[i] = (x > 20.0f) ? x : logf(1.0f + expf(x));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
|
||||
struct htp_ops_context * octx = uctx->octx;
|
||||
|
|
@ -247,6 +318,18 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
case HTP_OP_SQRT:
|
||||
sqrt_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_NEG:
|
||||
neg_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_EXP:
|
||||
exp_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
sigmoid_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
@ -295,6 +378,18 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|||
case HTP_OP_SQRT:
|
||||
op_type = "sqrt-f32";
|
||||
break;
|
||||
case HTP_OP_UNARY_NEG:
|
||||
op_type = "neg-f32";
|
||||
break;
|
||||
case HTP_OP_UNARY_EXP:
|
||||
op_type = "exp-f32";
|
||||
break;
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
op_type = "sigmoid-f32";
|
||||
break;
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
op_type = "softplus-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
|
|
|
|||
|
|
@ -53,9 +53,6 @@ endif()
|
|||
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
|
||||
# Workaround old compilers
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
|
||||
|
||||
file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
|
||||
list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
|
||||
|
||||
|
|
@ -74,12 +71,11 @@ if (GGML_CUDA_FA_ALL_QUANTS)
|
|||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
list(APPEND GGML_SOURCES_ROCM
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-hip
|
||||
|
|
@ -132,6 +128,11 @@ endif()
|
|||
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
if (WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
# CMake on Windows doesn't support the HIP language yet.
|
||||
# Therefore we workaround debug build's failure on HIP backend this way.
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES COMPILE_FLAGS "-O2 -g")
|
||||
endif()
|
||||
target_link_libraries(ggml-hip PRIVATE hip::device)
|
||||
else()
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||
|
|
|
|||
|
|
@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
|
|||
|
||||
// expose GGUF internals for test code
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
||||
#endif // __cplusplus
|
||||
|
|
|
|||
|
|
@ -246,6 +246,10 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal
|
|||
case GGML_UNARY_OP_EXP: op_num = OP_UNARY_NUM_EXP; break;
|
||||
case GGML_UNARY_OP_SOFTPLUS: op_num = OP_UNARY_NUM_SOFTPLUS; break;
|
||||
case GGML_UNARY_OP_EXPM1: op_num = OP_UNARY_NUM_EXPM1; break;
|
||||
case GGML_UNARY_OP_FLOOR: op_num = OP_UNARY_NUM_FLOOR; break;
|
||||
case GGML_UNARY_OP_CEIL: op_num = OP_UNARY_NUM_CEIL; break;
|
||||
case GGML_UNARY_OP_ROUND: op_num = OP_UNARY_NUM_ROUND; break;
|
||||
case GGML_UNARY_OP_TRUNC: op_num = OP_UNARY_NUM_TRUNC; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
|
|
@ -1748,6 +1752,28 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d(ggml_met
|
|||
return res;
|
||||
}
|
||||
|
||||
// Returns the Metal pipeline for a GGML_OP_CONV_3D node.
// The kernel name is "kernel_conv_3d_<src0 type>_<src1 type>". The pipeline is
// looked up in the library first and compiled only when the lookup finds nothing.
// Requires a contiguous F16/F32 src[0], an F32 src[1] and an F32 result.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d(ggml_metal_library_t lib, const ggml_tensor * op) {
    assert(op->op == GGML_OP_CONV_3D);

    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
    GGML_ASSERT(op->type == GGML_TYPE_F32);

    char base[256];
    char name[256];

    const char * src0_type = ggml_type_name(op->src[0]->type);
    const char * src1_type = ggml_type_name(op->src[1]->type);

    // This kernel has no specialization suffix, so the pipeline name equals the base name
    snprintf(base, sizeof(base), "kernel_conv_3d_%s_%s", src0_type, src1_type);
    snprintf(name, sizeof(name), "%s", base);

    ggml_metal_pipeline_with_params found = ggml_metal_library_get_pipeline(lib, name);
    if (found.pipeline) {
        return found;
    }

    // Not in the library yet: compile it now
    return ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_UPSCALE);
|
||||
|
||||
|
|
|
|||
|
|
@ -148,6 +148,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col
|
|||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
|
|
|||
|
|
@ -690,7 +690,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
|||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
|
|
@ -740,7 +740,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
|||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
|
|
@ -1039,6 +1039,10 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -1077,6 +1081,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_3D:
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
|
||||
op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SUM:
|
||||
return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_TRI:
|
||||
|
|
@ -1143,6 +1152,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
op->src[0]->ne[0] != 192 &&
|
||||
op->src[0]->ne[0] != 256 &&
|
||||
op->src[0]->ne[0] != 320 &&
|
||||
op->src[0]->ne[0] != 512 &&
|
||||
op->src[0]->ne[0] != 576) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -120,6 +120,10 @@
|
|||
#define OP_UNARY_NUM_EXP 114
|
||||
#define OP_UNARY_NUM_SOFTPLUS 115
|
||||
#define OP_UNARY_NUM_EXPM1 116
|
||||
#define OP_UNARY_NUM_FLOOR 117
|
||||
#define OP_UNARY_NUM_CEIL 118
|
||||
#define OP_UNARY_NUM_ROUND 119
|
||||
#define OP_UNARY_NUM_TRUNC 120
|
||||
|
||||
#define OP_SUM_ROWS_NUM_SUM_ROWS 10
|
||||
#define OP_SUM_ROWS_NUM_MEAN 11
|
||||
|
|
@ -643,6 +647,42 @@ typedef struct {
|
|||
int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources
|
||||
} ggml_metal_kargs_im2col;
|
||||
|
||||
// Arguments passed to kernel_conv_3d (direct 3D convolution).
// The field order is part of the host <-> kernel contract: the host fills this
// struct positionally, so members must not be reordered.
typedef struct {
    // input volume extents (taken from src[1])
    int32_t  IW;
    int32_t  IH;
    int32_t  ID;
    // output volume extents (taken from dst)
    int32_t  OW;
    int32_t  OH;
    int32_t  OD;
    // kernel (weight) extents (taken from src[0])
    int32_t  KW;
    int32_t  KH;
    int32_t  KD;
    // stride along width / height / depth
    int32_t  s0;
    int32_t  s1;
    int32_t  s2;
    // padding along width / height / depth
    int32_t  p0;
    int32_t  p1;
    int32_t  p2;
    // dilation along width / height / depth
    int32_t  d0;
    int32_t  d1;
    int32_t  d2;
    // input channels, batch size, output channels
    int32_t  IC;
    int32_t  N;
    int32_t  OC;
    // byte strides of src[0] (weights)
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    // byte strides of src[1] (input)
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    // byte strides of dst (output)
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
} ggml_metal_kargs_conv_3d;
|
||||
|
||||
typedef struct{
|
||||
int32_t ne00;
|
||||
uint64_t nb01;
|
||||
|
|
|
|||
|
|
@ -394,6 +394,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
|||
{
|
||||
n_fuse = ggml_metal_op_conv_transpose_2d(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_CONV_3D:
|
||||
{
|
||||
n_fuse = ggml_metal_op_conv_3d(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_UPSCALE:
|
||||
{
|
||||
n_fuse = ggml_metal_op_upscale(ctx, idx);
|
||||
|
|
@ -3697,6 +3701,77 @@ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// Encode a direct 3D convolution (GGML_OP_CONV_3D) for graph node `idx`.
//
// src[0] holds the weights and src[1] the input volume. One thread is launched
// per output element:
//   grid X - flattened OW*OH*OD output volume, in threadgroups of 32 threads
//   grid Y - output channel
//   grid Z - batch index
//
// Returns 1; the caller stores the result as n_fuse.
int ggml_metal_op_conv_3d(ggml_metal_op_t ctx, int idx) {
    ggml_tensor * op = ctx->node(idx);

    ggml_metal_library_t lib = ctx->lib;
    ggml_metal_encoder_t enc = ctx->enc;

    // byte strides: nb00..nb03 (weights), nb10..nb13 (input), nb0..nb3 (output)
    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
    GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);

    // hyperparameters are stored as consecutive int32 values in op_params
    const int32_t * params = (const int32_t *) op->op_params;

    const int32_t s0 = params[0];
    const int32_t s1 = params[1];
    const int32_t s2 = params[2];
    const int32_t p0 = params[3];
    const int32_t p1 = params[4];
    const int32_t p2 = params[5];
    const int32_t d0 = params[6];
    const int32_t d1 = params[7];
    const int32_t d2 = params[8];
    const int32_t IC = params[9];
    const int32_t N  = params[10];
    const int32_t OC = params[11];

    // positional initialization - the order must match ggml_metal_kargs_conv_3d
    ggml_metal_kargs_conv_3d args = {
        /*.IW   =*/ (int32_t) op->src[1]->ne[0],
        /*.IH   =*/ (int32_t) op->src[1]->ne[1],
        /*.ID   =*/ (int32_t) op->src[1]->ne[2],
        /*.OW   =*/ (int32_t) op->ne[0],
        /*.OH   =*/ (int32_t) op->ne[1],
        /*.OD   =*/ (int32_t) op->ne[2],
        /*.KW   =*/ (int32_t) op->src[0]->ne[0],
        /*.KH   =*/ (int32_t) op->src[0]->ne[1],
        /*.KD   =*/ (int32_t) op->src[0]->ne[2],
        /*.s0   =*/ s0,
        /*.s1   =*/ s1,
        /*.s2   =*/ s2,
        /*.p0   =*/ p0,
        /*.p1   =*/ p1,
        /*.p2   =*/ p2,
        /*.d0   =*/ d0,
        /*.d1   =*/ d1,
        /*.d2   =*/ d2,
        /*.IC   =*/ IC,
        /*.N    =*/ N,
        /*.OC   =*/ OC,
        /*.nb00 =*/ nb00,
        /*.nb01 =*/ nb01,
        /*.nb02 =*/ nb02,
        /*.nb03 =*/ nb03,
        /*.nb10 =*/ nb10,
        /*.nb11 =*/ nb11,
        /*.nb12 =*/ nb12,
        /*.nb13 =*/ nb13,
        /*.nb0  =*/ nb0,
        /*.nb1  =*/ nb1,
        /*.nb2  =*/ nb2,
        /*.nb3  =*/ nb3,
    };

    auto pipeline = ggml_metal_library_get_pipeline_conv_3d(lib, op);

    // threads per threadgroup; kernel_conv_3d hard-codes the same 32 when it
    // rebuilds the flat spatial index, so the two must stay in sync
    const int nth0 = 32;
    const int nth1 = 1;
    const int nth2 = 1;

    // widen BEFORE multiplying: the product of three int32 extents can exceed
    // INT32_MAX, and signed overflow is undefined behavior
    const int64_t spatial_volume = (int64_t) args.OW * args.OH * args.OD;

    const int ntg0 = (int) ((spatial_volume + nth0 - 1) / nth0);
    const int ntg1 = args.OC;
    const int ntg2 = args.N;

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);

    ggml_metal_encoder_dispatch_threadgroups(enc, ntg0, ntg1, ntg2, nth0, nth1, nth2);

    return 1;
}
|
||||
|
||||
int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx);
|
|||
int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_conv_2d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_conv_3d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx);
|
||||
|
|
|
|||
|
|
@ -1094,6 +1094,22 @@ kernel void kernel_unary_impl(
|
|||
// TODO: precise implementation
|
||||
dst_ptr[i0] = (T) (exp(x) - 1);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_FLOOR) {
|
||||
dst_ptr[i0] = (T) floor(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_CEIL) {
|
||||
dst_ptr[i0] = (T) ceil(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_ROUND) {
|
||||
dst_ptr[i0] = (T) round(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_TRUNC) {
|
||||
dst_ptr[i0] = (T) trunc(x);
|
||||
}
|
||||
}
|
||||
|
||||
#undef FC_OP
|
||||
|
|
@ -4883,6 +4899,98 @@ kernel void kernel_upscale_bilinear_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// Direct (gather) 3D convolution. Each thread produces one output element.
//
//   T    - element type of the weights in src0 (instantiated for float and half)
//   src0 - weights [IC * OC, KD, KH, KW]
//   src1 - input   [IC * N,  ID, IH, IW], read as float
//   dst  - output  [OC * N,  OD, OH, OW], written as float
//
// Grid layout: X = flattened output volume (32 threads per threadgroup),
//              Y = output channel, Z = batch index.
template <typename T>
kernel void kernel_conv_3d(
        constant ggml_metal_kargs_conv_3d & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]]) {

    // all volume arithmetic is widened to 64 bits first: the product of the
    // int32 extents (and tgpig.x * 32) can exceed the 32-bit range
    const int64_t plane  = (int64_t) args.OW * args.OH;
    const int64_t volume = plane * args.OD;

    // 32 is the threadgroup width used by the host-side dispatch
    const int64_t spatial_idx = (int64_t) tgpig.x * 32 + tpitg.x;

    if (spatial_idx >= volume) {
        return; // thread falls outside the output volume
    }

    // un-flatten the spatial index into (od, oh, ow)
    const int64_t od = spatial_idx / plane;
    const int64_t oh = (spatial_idx / args.OW) % args.OH;
    const int64_t ow = spatial_idx % args.OW;

    const int64_t oc        = tgpig.y;
    const int64_t batch_idx = tgpig.z;

    // input coordinates addressed by kernel tap (0, 0, 0); may be negative
    // because of padding
    const int64_t i_w_base = ow * args.s0 - args.p0;
    const int64_t i_h_base = oh * args.s1 - args.p1;
    const int64_t i_d_base = od * args.s2 - args.p2;

    float sum = 0.0f;

    for (int64_t ic = 0; ic < args.IC; ++ic) {
        // batch/output-channel and input channel share the 4th dimension
        const int64_t src_cn_idx = batch_idx * args.IC + ic;
        const int64_t w_cn_idx   = oc * args.IC + ic;

        for (int64_t kz = 0; kz < args.KD; ++kz) {
            const int64_t id = i_d_base + kz * args.d2;
            if (id < 0 || id >= args.ID) {
                continue; // tap lies in the padding region
            }

            for (int64_t ky = 0; ky < args.KH; ++ky) {
                const int64_t ih = i_h_base + ky * args.d1;
                if (ih < 0 || ih >= args.IH) {
                    continue;
                }

                for (int64_t kx = 0; kx < args.KW; ++kx) {
                    const int64_t iw = i_w_base + kx * args.d0;
                    if (iw < 0 || iw >= args.IW) {
                        continue;
                    }

                    // byte offsets of the weight tap and the matching input element
                    const int64_t w_idx = kx*args.nb00 + ky*args.nb01 + kz*args.nb02 + w_cn_idx*args.nb03;
                    const int64_t i_idx = iw*args.nb10 + ih*args.nb11 + id*args.nb12 + src_cn_idx*args.nb13;

                    // weights are converted to f32 (no-op when T is float)
                    const float w_val = (float) *(device const T *)(src0 + w_idx);
                    const float i_val = *(device const float *)(src1 + i_idx);

                    sum += w_val * i_val;
                }
            }
        }
    }

    // store the accumulated value at (ow, oh, od, batch * OC + oc)
    const int64_t dst_cn_idx = batch_idx * args.OC + oc;
    const int64_t d_idx      = ow*args.nb0 + oh*args.nb1 + od*args.nb2 + dst_cn_idx*args.nb3;

    *(device float *)(dst + d_idx) = sum;
}
|
||||
|
||||
// Explicit instantiations; host_name is the name under which each variant is
// exposed to the host. The suffix reads <weight type>_<input type>.
typedef decltype(kernel_conv_3d<float>) kernel_conv_3d_t;

template [[host_name("kernel_conv_3d_f32_f32")]] kernel kernel_conv_3d_t kernel_conv_3d<float>; // f32 weights
template [[host_name("kernel_conv_3d_f16_f32")]] kernel kernel_conv_3d_t kernel_conv_3d<half>;  // f16 weights
|
||||
|
||||
|
||||
static inline float bicubic_weight1(float x) {
|
||||
const float a = -0.75f;
|
||||
return ((a + 2) * x - (a + 3)) * x * x + 1;
|
||||
|
|
@ -6177,6 +6285,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
|
||||
|
|
@ -6192,6 +6301,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
||||
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
|
|
@ -6208,6 +6318,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
||||
#endif
|
||||
|
||||
|
|
@ -6224,6 +6335,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
|
||||
|
|
@ -6239,6 +6351,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
|
||||
|
|
@ -6254,6 +6367,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
|
||||
|
|
@ -6269,6 +6383,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
|
||||
|
|
@ -6284,6 +6399,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_at
|
|||
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 512, 512>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
|
@ -6865,6 +6981,17 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flas
|
|||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 320, 256, 2>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f32_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 512, 512, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 512, 512, 1>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 512, 512, 1>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 512, 512, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 512, 512, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 512, 512, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 512, 512, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 512, 512, 1>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
|
|
|
|||
|
|
@ -48,12 +48,11 @@ if (MUSAToolkit_FOUND)
|
|||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
list(APPEND GGML_SOURCES_MUSA
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
|
||||
endif()
|
||||
|
||||
set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ set(GGML_OPENCL_KERNELS
|
|||
mul_mv_q4_1_f32
|
||||
mul_mv_q4_1_f32_flat
|
||||
mul_mv_q4_k_f32
|
||||
mul_mv_q4_k_f32_flat
|
||||
mul_mv_q6_k_f32
|
||||
mul_mv_q6_k_f32_flat
|
||||
mul_mv_q8_0_f32
|
||||
|
|
@ -107,11 +108,14 @@ set(GGML_OPENCL_KERNELS
|
|||
mul_mm_q4_0_f32_l4_lm
|
||||
mul_mm_q4_1_f32_l4_lm
|
||||
mul_mm_q8_0_f32_l4_lm
|
||||
mul_mm_q4_k_f32_l4_lm
|
||||
mul_mm_q6_k_f32_l4_lm
|
||||
mul_mm_q8_0_f32_8x4
|
||||
gemv_noshuffle_q4_1_f32
|
||||
gemm_noshuffle_q4_1_f32
|
||||
gemv_noshuffle_general_q8_0_f32
|
||||
gemv_noshuffle_q6_k_f32
|
||||
gemm_noshuffle_q6_k_f32
|
||||
mul
|
||||
neg
|
||||
norm
|
||||
|
|
|
|||
|
|
@ -394,6 +394,9 @@ struct ggml_backend_opencl_context {
|
|||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool disable_fusion;
|
||||
|
||||
bool adreno_has_large_buffer;
|
||||
bool adreno_use_large_buffer;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
int adreno_wave_size;
|
||||
|
|
@ -529,16 +532,19 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
|
||||
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
|
||||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
|
||||
cl_kernel kernel_convert_block_q6_K_noshuffle, kernel_restore_block_q6_K_noshuffle;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32_flat;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
||||
|
|
@ -578,6 +584,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_k_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
|
||||
|
||||
std::vector<ProfilingInfo> profiling_info;
|
||||
|
|
@ -713,6 +720,8 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_gemm_noshuffle_q4_1_f32;
|
||||
cl_kernel kernel_mul_mm_q8_0_f32_8x4;
|
||||
cl_kernel CL_mul_mat_vec_q8_0_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
|
|
@ -781,6 +790,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
compile_opts += " -qcom-enable-large-buffer ";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
|
||||
|
||||
// add
|
||||
|
|
@ -917,8 +930,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K_noshuffle", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -1209,6 +1226,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q4_k_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q4_k_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q4_k_f32_flat.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32_flat", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q6_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -1482,6 +1516,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q4_k_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_q4_k_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_q4_k_f32_l4_lm.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_k_f32_l4_lm", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q6_k_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -2603,6 +2654,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q6_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q6_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q6_k_f32.cl");
|
||||
#endif
|
||||
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q6_K_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q6_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q6_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q6_k_f32.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q6_K_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_LOG_CONT("\n");
|
||||
}
|
||||
|
|
@ -2937,6 +3027,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
// check Adreno large buffer support
|
||||
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
|
||||
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
|
|
@ -3003,6 +3095,18 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// determine whether to use large buffer for Adreno
|
||||
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
if (!backend_ctx->adreno_has_large_buffer) {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
|
||||
backend_ctx->adreno_use_large_buffer = false;
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
|
||||
}
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
// A local ref of cl_context for convenience
|
||||
|
|
@ -3347,6 +3451,40 @@ struct ggml_tensor_extra_cl_q8_0 {
|
|||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q4_K {
|
||||
// Quantized values
|
||||
cl_mem q = nullptr;
|
||||
// Scales for each super block.
|
||||
cl_mem s = nullptr;
|
||||
// Scales
|
||||
cl_mem d = nullptr;
|
||||
// Min
|
||||
cl_mem dm = nullptr;
|
||||
|
||||
~ggml_tensor_extra_cl_q4_K() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
if (q != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(q));
|
||||
q = nullptr;
|
||||
}
|
||||
if (s != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(s));
|
||||
s = nullptr;
|
||||
}
|
||||
if (d != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(d));
|
||||
d = nullptr;
|
||||
}
|
||||
if (dm != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(dm));
|
||||
dm = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q6_K {
|
||||
// Lower 4 bits of quantized weights.
|
||||
cl_mem ql = nullptr;
|
||||
|
|
@ -3956,6 +4094,12 @@ struct ggml_backend_opencl_buffer_context {
|
|||
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
|
||||
delete e;
|
||||
}
|
||||
|
|
@ -4039,6 +4183,21 @@ struct ggml_backend_opencl_buffer_context {
|
|||
return extra;
|
||||
}
|
||||
|
||||
// Hand out a Q4_K extra for temporary use: reuse the most recently pooled one
// when the free pool is non-empty, otherwise allocate a fresh object. The
// result is recorded in the in-use list and reset() before being returned,
// so the caller always receives an extra with no buffers attached.
ggml_tensor_extra_cl_q4_K * ggml_opencl_alloc_temp_tensor_extra_q4_K() {
    ggml_tensor_extra_cl_q4_K * picked;

    if (!temp_tensor_extras_q4_K.empty()) {
        // Take from the back of the free pool.
        picked = temp_tensor_extras_q4_K.back();
        temp_tensor_extras_q4_K.pop_back();
    } else {
        picked = new ggml_tensor_extra_cl_q4_K();
    }

    // Track it first, then drop any buffers left over from a previous use.
    temp_tensor_extras_q4_K_in_use.push_back(picked);
    picked->reset();

    return picked;
}
|
||||
|
||||
ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
|
||||
ggml_tensor_extra_cl_q6_K * extra;
|
||||
if (temp_tensor_extras_q6_K.empty()) {
|
||||
|
|
@ -4080,6 +4239,11 @@ struct ggml_backend_opencl_buffer_context {
|
|||
}
|
||||
temp_tensor_extras_q8_0_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) {
|
||||
temp_tensor_extras_q4_K.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_q4_K_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
|
||||
temp_tensor_extras_q6_K.push_back(e);
|
||||
}
|
||||
|
|
@ -4101,6 +4265,8 @@ struct ggml_backend_opencl_buffer_context {
|
|||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
|
||||
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K;
|
||||
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
|
||||
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
|
||||
|
||||
|
|
@ -4835,6 +5001,83 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_K) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
||||
// Allocate the new extra and create aliases from the original.
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
ggml_tensor_extra_cl_q4_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_K();
|
||||
|
||||
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(3 * ggml_blck_size(tensor->type) / 64);
|
||||
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
||||
GGML_ASSERT(size_d + size_dm + size_s + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK(clEnqueueWriteBuffer(
|
||||
queue, data_device, CL_TRUE, 0,
|
||||
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
|
||||
cl_buffer_region region;
|
||||
|
||||
// Create subbuffer for d.
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_d;
|
||||
extra->d = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for mins.
|
||||
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
|
||||
region.size = size_dm;
|
||||
extra->dm = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for s.
|
||||
region.origin = align_to(previous_origin + size_dm, backend_ctx->alignment);
|
||||
region.size = size_s;
|
||||
extra->s = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for quants.
|
||||
region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
|
||||
region.size = size_q;
|
||||
extra->q = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
|
@ -4851,61 +5094,58 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
"Incorrect tensor size");
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK(clEnqueueWriteBuffer(
|
||||
queue, data_device, CL_TRUE, 0,
|
||||
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
cl_mem data_device;
|
||||
CL_CHECK((data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err), err));
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
|
||||
cl_buffer_region region;
|
||||
|
||||
// Subbuffer for ql
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_ql;
|
||||
extra->ql = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Subbuffer for qh
|
||||
region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
|
||||
region.size = size_qh;
|
||||
extra->qh = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Subbuffer for scales
|
||||
region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
|
||||
region.size = size_s;
|
||||
extra->s = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for d.
|
||||
region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
|
||||
region.size = size_d;
|
||||
extra->d = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Flatten the weights
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
|
||||
cl_kernel kernel;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
|
||||
cl_uchar mask = 0xff;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64)*64, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
|
|
@ -4919,6 +5159,29 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
extra->size_d = size_d;
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
// Transpose ql as ushort
|
||||
transpose_2d_as_16b(backend_ctx,
|
||||
extra->ql, extra->ql, size_ql, K/4, M);
|
||||
|
||||
// Transpose qh as uchar
|
||||
transpose_2d_as_8b(backend_ctx,
|
||||
extra->qh, extra->qh, size_qh, K/4, M);
|
||||
|
||||
// Transpose s as ushort
|
||||
transpose_2d_as_16b(backend_ctx,
|
||||
extra->s, extra->s, size_s, K/16/2, M);
|
||||
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx,
|
||||
extra->d, extra->d, size_d, K/256, M);
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
|
|
@ -5245,24 +5508,111 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
|
||||
if (tensor->type == GGML_TYPE_Q4_K) {
|
||||
ggml_tensor_extra_cl_q4_K * extra = (ggml_tensor_extra_cl_q4_K *)tensor->extra;
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
static ggml_cl_buffer buf_trans_ql;
|
||||
static ggml_cl_buffer buf_trans_qh;
|
||||
static ggml_cl_buffer buf_trans_s;
|
||||
static ggml_cl_buffer buf_trans_d;
|
||||
static ggml_cl_buffer buf_unpacked;
|
||||
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
GGML_ASSERT(K % ggml_blck_size(tensor->type) == 0);
|
||||
|
||||
size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
|
||||
size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
|
||||
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
buf_trans_ql.allocate(backend_ctx->context, size_ql);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_s.allocate(backend_ctx->context, size_s);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
|
||||
|
||||
// transpose ql, qh, s and d back
|
||||
transpose_2d_as_16b(backend_ctx, extra->ql, buf_trans_ql.buffer, size_ql, M, K/4);
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/4);
|
||||
transpose_2d_as_16b(backend_ctx, extra->s, buf_trans_s.buffer, size_s, M, K/16/2);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
|
||||
|
||||
// unpack
|
||||
cl_uchar mask = 0xFF;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_ql.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_s.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_unpacked.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n_blk, 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_uchar mask = 0xFF;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n_blk, 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
|
@ -5331,6 +5681,11 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
|||
|
||||
cl_int err;
|
||||
cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
if (err != CL_SUCCESS && backend_ctx->adreno_use_large_buffer) {
|
||||
cl_mem_properties props[] = { 0x41A6 /* CL_LARGE_BUFFER_QCOM */, 1, 0 };
|
||||
mem = clCreateBufferWithProperties(backend_ctx->context, props, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
}
|
||||
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
|
||||
return nullptr;
|
||||
|
|
@ -5553,6 +5908,8 @@ typedef struct {
|
|||
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
|
||||
"wrong q4_0 block size/padding");
|
||||
|
||||
#define QK_MXFP4 32
|
||||
|
||||
#include <math.h>
|
||||
#ifdef __cplusplus
|
||||
#include "half.hpp"
|
||||
|
|
@ -5596,7 +5953,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|||
buf_d = malloc(size_e);
|
||||
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->e, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
} else {
|
||||
// Read out the tensor from GPU memory.
|
||||
|
|
@ -9331,6 +9688,196 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
|||
#endif
|
||||
}
|
||||
|
||||
// Matrix multiply for Q6_K weights (src0) with F32 activations (src1) using
// the Adreno "noshuffle" kernels. Only compiled in when
// GGML_OPENCL_USE_ADRENO_KERNELS is defined; otherwise the function is a no-op.
//
//   backend - backend whose context supplies the kernels and the command queue
//   src0    - Q6_K tensor; its extra must be a ggml_tensor_extra_cl_q6_K
//   src1    - activation tensor; its extra must be a ggml_tensor_extra_cl
//   dst     - output tensor; its extra must be a ggml_tensor_extra_cl
//
// Two paths, selected on dst->ne[1]:
//   ne1 == 1 : gemv - ql/qh and the activation are wrapped in 1D image
//              buffers and passed to kernel_gemv_noshuffle_q6_K_f32.
//   ne1 != 1 : gemm - the activation is first transposed into
//              prealloc_act_trans by kernel_transpose_32_16 (N padded up to a
//              multiple of 8), then kernel_gemm_noshuffle_q6_K_f32 runs on it.
// All temporary sub-buffers and images are released before returning; the
// kernels are only enqueued, not waited on.
static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];

    const int ne1 = dst->ne[1];

    // Rows must consist of whole quantization blocks.
    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);

    cl_context context = backend_ctx->context;
    cl_kernel kernel;

    cl_int err;
    cl_buffer_region region;
    cl_image_format img_fmt;
    cl_image_desc img_desc;

    if (ne1 == 1) {
        // ---- gemv path: single activation column ----
        cl_mem ql_img = nullptr;
        cl_mem qh_img = nullptr;
        cl_mem b_sub_buffer = nullptr;
        cl_mem b_img = nullptr;

        // image for ql
        img_fmt.image_channel_order = CL_R;
        img_fmt.image_channel_data_type = CL_FLOAT;
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = ne01 * ne00 / 8;
        img_desc.buffer = extra0_q6_K->ql;
        CL_CHECK((ql_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        // image for qh
        img_fmt.image_channel_order = CL_R;
        img_fmt.image_channel_data_type = CL_HALF_FLOAT;
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = ne01 * ne00 / 8;
        img_desc.buffer = extra0_q6_K->qh;
        CL_CHECK((qh_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        // subbuffer for activation, starting at src1's offset in its backing buffer
        region.origin = offset1;
        region.size = ne00 * ne1 * sizeof(float);
        CL_CHECK((b_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for activation
        img_fmt.image_channel_order = CL_RGBA;
        img_fmt.image_channel_data_type = CL_FLOAT;
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = ne00 * ne1 / 4;
        img_desc.buffer = b_sub_buffer;
        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        kernel = backend_ctx->kernel_gemv_noshuffle_q6_K_f32;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &ql_img));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &qh_img));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra0_q6_K->s));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0_q6_K->d));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &b_img));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int),   &ne00));
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int),   &ne01));

        // First dimension is ne01/2 rounded up to a multiple of the local size (64).
        size_t local_work_size[3] = {64, 4, 1};
        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

        CL_CHECK(clReleaseMemObject(ql_img));
        CL_CHECK(clReleaseMemObject(qh_img));
        CL_CHECK(clReleaseMemObject(b_sub_buffer));
        CL_CHECK(clReleaseMemObject(b_img));
    } else {
        // ---- gemm path: multiple activation columns ----
        cl_mem b_sub_buf;
        cl_mem b_buf_trans;
        cl_mem b_img;
        cl_mem b_img_trans;

        // subbuffer for activation
        region.origin = offset1;
        region.size = ne00 * ne1 * sizeof(float);
        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for activation
        img_fmt.image_channel_order = CL_RGBA;
        img_fmt.image_channel_data_type = CL_FLOAT;
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = ne00 * ne1 / 4;
        img_desc.buffer = b_sub_buf;
        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        // pad N to multiple of 8
        int extra_elements = ne1 % 8;
        int padding = 0;
        if (extra_elements > 0){
            padding = 8 - extra_elements;
        }

        // subbuffer for transposed activation; sized at half of sizeof(float)
        // per element, matching the CL_HALF_FLOAT image created over it below
        region.origin = 0;
        region.size = ne00 * (ne1 + padding) * sizeof(float)/2;
        backend_ctx->prealloc_act_trans.allocate(context, region.size);
        CL_CHECK((b_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for transposed activation
        img_fmt.image_channel_order = CL_RGBA;
        img_fmt.image_channel_data_type = CL_HALF_FLOAT;
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = ne00 * (ne1 + padding) / 4;
        img_desc.buffer = b_buf_trans;
        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));

        // transpose activation
        int height_B = ne1/4;
        if (height_B == 0) {
            height_B = 1;
        }
        int width_B = ne00/4;
        int padded_height_B = (ne1 + padding) / 4;

        kernel = backend_ctx->kernel_transpose_32_16;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));

        size_t local_size_t[2] = { 1, 16 };
        size_t global_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);

        // gemm
        kernel = backend_ctx->kernel_gemm_noshuffle_q6_K_f32;
        int padded_N = ne1 + padding;

        cl_ushort mask_f000 = 0xF000;
        cl_uchar mask_c0 = 0xC0;

        CL_CHECK(clSetKernelArg(kernel, 0,  sizeof(cl_mem),    &extra0_q6_K->ql));
        CL_CHECK(clSetKernelArg(kernel, 1,  sizeof(cl_mem),    &extra0_q6_K->qh));
        CL_CHECK(clSetKernelArg(kernel, 2,  sizeof(cl_mem),    &extra0_q6_K->s));
        CL_CHECK(clSetKernelArg(kernel, 3,  sizeof(cl_mem),    &extra0_q6_K->d));
        CL_CHECK(clSetKernelArg(kernel, 4,  sizeof(cl_mem),    &b_img_trans));
        CL_CHECK(clSetKernelArg(kernel, 5,  sizeof(cl_mem),    &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 6,  sizeof(cl_ulong),  &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 7,  sizeof(int),       &ne01));
        CL_CHECK(clSetKernelArg(kernel, 8,  sizeof(int),       &padded_N));
        CL_CHECK(clSetKernelArg(kernel, 9,  sizeof(int),       &ne00));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),       &ne1));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ushort), &mask_f000));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar),  &mask_c0));

        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
        size_t local_work_size[3] = {2, 128, 1};
        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

        CL_CHECK(clReleaseMemObject(b_sub_buf));
        CL_CHECK(clReleaseMemObject(b_img));
        CL_CHECK(clReleaseMemObject(b_buf_trans));
        CL_CHECK(clReleaseMemObject(b_img_trans));
    }
#else
    GGML_UNUSED(backend);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
#endif
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
|
|
@ -9357,6 +9904,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra;
|
||||
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
|
||||
#endif
|
||||
|
||||
|
|
@ -9466,6 +10014,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
return;
|
||||
}
|
||||
|
||||
// q6_K x fp32
|
||||
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q4_0 x fp32
|
||||
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
||||
// TODO: remove duplicate definitions of image description + format -- move to top
|
||||
|
|
@ -10005,6 +10559,50 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q4_K: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q6_K: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
|
|
@ -10449,6 +11047,43 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_q4_K_f32_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 2;
|
||||
ndst = 16;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
|
||||
#else
|
||||
kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
|
|
@ -10482,6 +11117,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@
|
|||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE (3 * QK_K / 64)
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
|
||||
typedef char int8_t;
|
||||
|
|
@ -55,6 +56,16 @@ struct block_q4_1 {
|
|||
uchar qs[QK4_1 / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_k
|
||||
//------------------------------------------------------------------------------
|
||||
// One q4_K super block in its original array-of-structs form. The
// kernel_convert_block_q4_K / kernel_restore_block_q4_K kernels in this file copy
// each of the four fields to / from its own flat array.
struct block_q4_K {
|
||||
half d; // delta
|
||||
half dm; // min
|
||||
// K_SCALE_SIZE (= 3 * QK_K / 64) scale bytes; their internal bit layout is not shown in this file section
uchar s[K_SCALE_SIZE];
|
||||
// QK_K / 2 bytes, each holding two 4-bit quants
uchar q[QK_K / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q6_K
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -408,6 +419,62 @@ kernel void kernel_restore_block_q8_0_trans(
|
|||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q4_K
|
||||
// Convert the block_q4_K format to 4 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
// Each thread processes a super block.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_q4_K(
    global struct block_q4_K * src0,
    global uchar * dst_q,
    global uchar * dst_s,
    global half * dst_d,
    global half * dst_dm
) {
    // Work-item `blk` owns super block `blk`. There is no bounds guard in this
    // kernel, so every launched work-item reads and writes.
    const size_t blk = get_global_id(0);

    global struct block_q4_K * in = src0 + blk;
    global uchar * out_q = dst_q + QK_K/2*blk;          // QK_K/2 quant bytes per block
    global uchar * out_s = dst_s + K_SCALE_SIZE*blk;    // K_SCALE_SIZE scale bytes per block

    // One d and one dm value per super block.
    dst_d[blk]  = in->d;
    dst_dm[blk] = in->dm;

    // Byte-for-byte copies: the quant nibbles and scale bytes are not rearranged.
    for (int k = 0; k < QK_K/2; ++k) {
        out_q[k] = in->q[k];
    }
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        out_s[k] = in->s[k];
    }
}
|
||||
|
||||
// Restore block_q4_K from flattened arrays.
|
||||
// Each thread processes a super block.
|
||||
kernel void kernel_restore_block_q4_K(
    global uchar * src_q,
    global uchar * src_s,
    global half * src_d,
    global half * src_dm,
    global struct block_q4_K * dst
) {
    // Work-item `blk` rebuilds super block `blk` from the four flat arrays.
    // There is no bounds guard in this kernel.
    const size_t blk = get_global_id(0);

    global struct block_q4_K * out = dst + blk;
    global uchar * in_q = src_q + QK_K/2*blk;           // QK_K/2 quant bytes per block
    global uchar * in_s = src_s + K_SCALE_SIZE*blk;     // K_SCALE_SIZE scale bytes per block

    out->d  = src_d[blk];
    out->dm = src_dm[blk];

    // Byte-for-byte copies back into the struct fields.
    for (int k = 0; k < QK_K/2; ++k) {
        out->q[k] = in_q[k];
    }
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        out->s[k] = in_s[k];
    }
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q6_K
|
||||
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
||||
|
|
@ -419,8 +486,13 @@ kernel void kernel_convert_block_q6_K(
|
|||
global uchar * dst_ql,
|
||||
global uchar * dst_qh,
|
||||
global char * dst_s,
|
||||
global half * dst_d
|
||||
global half * dst_d,
|
||||
uchar mask_lsb_8,
|
||||
ulong n_blk
|
||||
) {
|
||||
if (get_global_id(0) >= n_blk) {
|
||||
return;
|
||||
}
|
||||
global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
|
||||
global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
|
||||
global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
|
||||
|
|
@ -447,8 +519,13 @@ kernel void kernel_restore_block_q6_K(
|
|||
global uchar * dst_qh,
|
||||
global char * dst_s,
|
||||
global half * dst_d,
|
||||
global struct block_q6_K * dst
|
||||
global struct block_q6_K * dst,
|
||||
uchar mask_lsb_8,
|
||||
ulong n_blk
|
||||
) {
|
||||
if (get_global_id(0) >= n_blk) {
|
||||
return;
|
||||
}
|
||||
global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
|
||||
global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
|
||||
global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
|
||||
|
|
@ -467,3 +544,117 @@ kernel void kernel_restore_block_q6_K(
|
|||
b->scales[i] = s[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q6_K_noshuffle(
    global struct block_q6_K * src0,
    global uchar * dst_ql,
    global uchar * dst_qh,
    global char * dst_s,
    global half * dst_d,
    uchar mask_lsb_8,
    ulong n_blk
) {
    // One work-item per q6_K block; surplus work-items exit immediately.
    const size_t blk = get_global_id(0);
    if (blk >= n_blk) {
        return;
    }

    global struct block_q6_K * in = src0 + blk;
    global uchar * out_ql = dst_ql + QK_K/2*blk;
    global uchar * out_qh = dst_qh + QK_K/4*blk;
    global char * out_s = dst_s + QK_K/16*blk;

    dst_d[blk] = in->d;

    // ql: within each 64-byte half (base 0 and 64), take a pair of adjacent bytes
    // and regroup their nibbles: both low nibbles go to out[i], both high nibbles
    // go to out[i + 32].
    // NOTE(review): mask_lsb_8 is ANDed onto every byte read; presumably 0xFF
    // supplied by the host -- confirm.
    for (int i = 0; i < QK_K/2/4; ++i) {
        for (int base = 0; base <= 64; base += 64) {
            const uchar even = in->ql[i*2 + 0 + base] & mask_lsb_8;
            const uchar odd  = in->ql[i*2 + 1 + base] & mask_lsb_8;
            out_ql[i + base]      = (even & 0x0F) | ((odd & 0x0F) << 4);
            out_ql[i + base + 32] = ((even & 0xF0) >> 4) | (odd & 0xF0);
        }
    }

    // qh: within each 32-byte half (base 0 and 32), take four adjacent bytes and
    // transpose their 2-bit fields: field f of all four bytes lands in
    // out[i + 8*f].
    for (int i = 0; i < QK_K/4/8; ++i) {
        for (int base = 0; base <= 32; base += 32) {
            const uchar b0 = in->qh[i*4 + 0 + base] & mask_lsb_8;
            const uchar b1 = in->qh[i*4 + 1 + base] & mask_lsb_8;
            const uchar b2 = in->qh[i*4 + 2 + base] & mask_lsb_8;
            const uchar b3 = in->qh[i*4 + 3 + base] & mask_lsb_8;
            out_qh[i + base + 0]  = (b0 & 0x03) | ((b1 & 0x03) << 2) | ((b2 & 0x03) << 4) | ((b3 & 0x03) << 6);
            out_qh[i + base + 8]  = ((b0 & 0x0C) >> 2) | (b1 & 0x0C) | ((b2 & 0x0C) << 2) | ((b3 & 0x0C) << 4);
            out_qh[i + base + 16] = ((b0 & 0x30) >> 4) | ((b1 & 0x30) >> 2) | (b2 & 0x30) | ((b3 & 0x30) << 2);
            out_qh[i + base + 24] = ((b0 & 0xC0) >> 6) | ((b1 & 0xC0) >> 4) | ((b2 & 0xC0) >> 2) | (b3 & 0xC0);
        }
    }

    // Scales are copied unchanged, QK_K/16 per block.
    for (int i = 0; i < QK_K/16; ++i) {
        out_s[i] = in->scales[i];
    }
}
|
||||
|
||||
kernel void kernel_restore_block_q6_K_noshuffle(
    global uchar * src_ql,
    global uchar * src_qh,
    global char * src_s,
    global half * src_d,
    global struct block_q6_K * dst,
    uchar mask_lsb_8,
    ulong n_blk
) {
    // One work-item per q6_K block; surplus work-items exit immediately.
    const size_t blk = get_global_id(0);
    if (blk >= n_blk) {
        return;
    }

    global struct block_q6_K * out = dst + blk;
    global uchar * in_ql = src_ql + QK_K/2*blk;
    global uchar * in_qh = src_qh + QK_K/4*blk;
    global char * in_s = src_s + QK_K/16*blk;

    out->d = src_d[blk];

    // ql: undo the nibble regrouping done by kernel_convert_block_q6_K_noshuffle.
    // For each 64-byte half (base 0 and 64), bytes i and i + 32 of the flat array
    // are recombined into the adjacent struct bytes 2*i and 2*i + 1.
    // NOTE(review): mask_lsb_8 is ANDed onto every byte read; presumably 0xFF
    // supplied by the host -- confirm.
    for (int i = 0; i < QK_K/2/4; ++i) {
        for (int base = 0; base <= 64; base += 64) {
            const uchar first  = in_ql[i + 0 + base] & mask_lsb_8;
            const uchar second = in_ql[i + 32 + base] & mask_lsb_8;
            out->ql[i*2 + 0 + base] = (first & 0x0F) | ((second & 0x0F) << 4);
            out->ql[i*2 + 1 + base] = ((first & 0xF0) >> 4) | (second & 0xF0);
        }
    }

    // qh: the 2-bit field transpose is its own inverse. For each 32-byte half
    // (base 0 and 32), flat bytes i, i + 8, i + 16, i + 24 yield struct bytes
    // 4*i .. 4*i + 3.
    for (int i = 0; i < QK_K/4/8; ++i) {
        for (int base = 0; base <= 32; base += 32) {
            const uchar b0 = in_qh[i + 0 + base] & mask_lsb_8;
            const uchar b1 = in_qh[i + 8 + base] & mask_lsb_8;
            const uchar b2 = in_qh[i + 16 + base] & mask_lsb_8;
            const uchar b3 = in_qh[i + 24 + base] & mask_lsb_8;
            out->qh[i*4 + 0 + base] = (b0 & 0x03) | ((b1 & 0x03) << 2) | ((b2 & 0x03) << 4) | ((b3 & 0x03) << 6);
            out->qh[i*4 + 1 + base] = ((b0 & 0x0C) >> 2) | (b1 & 0x0C) | ((b2 & 0x0C) << 2) | ((b3 & 0x0C) << 4);
            out->qh[i*4 + 2 + base] = ((b0 & 0x30) >> 4) | ((b1 & 0x30) >> 2) | (b2 & 0x30) | ((b3 & 0x30) << 2);
            out->qh[i*4 + 3 + base] = ((b0 & 0xC0) >> 6) | ((b1 & 0xC0) >> 4) | ((b2 & 0xC0) >> 2) | (b3 & 0xC0);
        }
    }

    // Scales are copied unchanged, QK_K/16 per block.
    for (int i = 0; i < QK_K/16; ++i) {
        out->scales[i] = in_s[i];
    }
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,140 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
// Dequantizing GEMM: q6_K weights (given as separate ql / qh / scale / d arrays)
// times activations read as half4 texels from an image buffer.
// Each work-item accumulates, in half precision, a tile of 4 consecutive M rows
// (starting at 4*get_global_id(1)) by 8 consecutive N columns (starting at
// 8*get_global_id(0)) and stores it to dst as float.
//   src0_ql      low 4 bits of the weights, four weights per ushort
//   src0_qh      high 2 bits of the weights, four weights per uchar
//   src0_s       char scales, one per 16 weights, two packed per ushort
//   src0_d       half scales, one per 256 weights
//   src1         activations; texel (kk*(n/4) + t) supplies N columns 4t..4t+3 for K index kk
//   dst          output, advanced by offsetd bytes; column c / row r is stored at c*m + r
//   m, n, k      sizes used for addressing (n is the value used to stride src1)
//   n_no_padding N bound used only to suppress out-of-range stores
//   mask_f000, mask_c0  masks for the top nibble of ql / top two bits of qh
//                (NOTE(review): presumably 0xF000 and 0xC0, passed from the host -- confirm)
kernel void kernel_gemm_noshuffle_q6_K_f32(
|
||||
global const ushort * src0_ql,
|
||||
global const uchar * src0_qh,
|
||||
global const ushort * src0_s,
|
||||
global const half * src0_d,
|
||||
read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
int n_no_padding,
|
||||
ushort mask_f000,
|
||||
uchar mask_c0
|
||||
) {
|
||||
// offsetd is a byte offset into dst
dst = (global float *)( (global char *)dst + offsetd );
|
||||
|
||||
// NOTE: m_4 is not referenced anywhere below
int m_4 = m >> 2;
|
||||
// number of half4 texels per K index in src1
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0); // n
|
||||
int gx = get_global_id(1); // m
|
||||
// first of the 4 M rows handled by this work-item
int gx_2 = gx << 2;
|
||||
|
||||
// c0..c3: one half8 accumulator per M row, each lane is one of the 8 N columns
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
// all four weight arrays are addressed as (K group index)*m + row
global const ushort * ptr_ql = src0_ql + gx_2;
|
||||
global const uchar * ptr_qh = src0_qh + gx_2;
|
||||
global const ushort * ptr_s = src0_s + gx_2;
|
||||
global const half * ptr_d = src0_d + gx_2;
|
||||
|
||||
// each iteration consumes 4 K indices (the 4 weights packed in one ushort / uchar)
for (int i = 0; i < k; i += 4) {
|
||||
// load 4x elements (ushort) of ql on M, each ushort contains 4 weights
|
||||
// 4x ushort corresponds to 4 rows on M
|
||||
ushort4 bits4 = vload4(0, ptr_ql + (i/4)*m); // ql packed in 4s in ushort
|
||||
uchar4 bits2 = vload4(0, ptr_qh + (i/4)*m); // qh packed in 4s in uchar
|
||||
|
||||
// load 4 consecutive scales
|
||||
char8 scale_s_8 = as_char8(vload4(0, ptr_s + (i/16/2)*m)); // 1 char scale every 16 elements, packed in 2s
|
||||
char4 scale_s = ((i/16) % 2) == 0 ? scale_s_8.s0246 : scale_s_8.s1357; // transposed as ushort, 2 blocks
|
||||
half4 scale_d = vload4(0, ptr_d + (i/256)*m); // 1 half scale every 256 elements
|
||||
|
||||
// j=0
|
||||
// load 2x 4 elements of activations on N, corresponding to 8 rows on N
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i + 0)*n_4 + 0);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i + 0)*n_4 + 1);
|
||||
// weight = (4 low bits | 2 high bits << 4) - 32, then scaled by the char and half scales
dequantized_weights.s0 = (convert_half((bits4.s0 & 0x000F) | ((bits2.s0 & 0x03) << 4)) - 32.f) * scale_s.s0 * scale_d.s0;
|
||||
dequantized_weights.s1 = (convert_half((bits4.s1 & 0x000F) | ((bits2.s1 & 0x03) << 4)) - 32.f) * scale_s.s1 * scale_d.s1;
|
||||
dequantized_weights.s2 = (convert_half((bits4.s2 & 0x000F) | ((bits2.s2 & 0x03) << 4)) - 32.f) * scale_s.s2 * scale_d.s2;
|
||||
dequantized_weights.s3 = (convert_half((bits4.s3 & 0x000F) | ((bits2.s3 & 0x03) << 4)) - 32.f) * scale_s.s3 * scale_d.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i + 1)*n_4 + 0);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i + 1)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half((((bits4.s0 & 0x00F0) >> 4) | ((bits2.s0 & 0x0C) << 2))) - 32.f) * scale_s.s0 * scale_d.s0;
|
||||
dequantized_weights.s1 = (convert_half((((bits4.s1 & 0x00F0) >> 4) | ((bits2.s1 & 0x0C) << 2))) - 32.f) * scale_s.s1 * scale_d.s1;
|
||||
dequantized_weights.s2 = (convert_half((((bits4.s2 & 0x00F0) >> 4) | ((bits2.s2 & 0x0C) << 2))) - 32.f) * scale_s.s2 * scale_d.s2;
|
||||
dequantized_weights.s3 = (convert_half((((bits4.s3 & 0x00F0) >> 4) | ((bits2.s3 & 0x0C) << 2))) - 32.f) * scale_s.s3 * scale_d.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i + 2)*n_4 + 0);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i + 2)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half((((bits4.s0 & 0x0F00) >> 8) | (bits2.s0 & 0x30))) - 32.f) * scale_s.s0 * scale_d.s0;
|
||||
dequantized_weights.s1 = (convert_half((((bits4.s1 & 0x0F00) >> 8) | (bits2.s1 & 0x30))) - 32.f) * scale_s.s1 * scale_d.s1;
|
||||
dequantized_weights.s2 = (convert_half((((bits4.s2 & 0x0F00) >> 8) | (bits2.s2 & 0x30))) - 32.f) * scale_s.s2 * scale_d.s2;
|
||||
dequantized_weights.s3 = (convert_half((((bits4.s3 & 0x0F00) >> 8) | (bits2.s3 & 0x30))) - 32.f) * scale_s.s3 * scale_d.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i + 3)*n_4 + 0);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i + 3)*n_4 + 1);
|
||||
// the top nibble / top two bits are selected with the kernel-argument masks instead of literals
dequantized_weights.s0 = (convert_half((((bits4.s0 & mask_f000) >> 12) | ((bits2.s0 & mask_c0) >> 2))) - 32.f) * scale_s.s0 * scale_d.s0;
|
||||
dequantized_weights.s1 = (convert_half((((bits4.s1 & mask_f000) >> 12) | ((bits2.s1 & mask_c0) >> 2))) - 32.f) * scale_s.s1 * scale_d.s1;
|
||||
dequantized_weights.s2 = (convert_half((((bits4.s2 & mask_f000) >> 12) | ((bits2.s2 & mask_c0) >> 2))) - 32.f) * scale_s.s2 * scale_d.s2;
|
||||
dequantized_weights.s3 = (convert_half((((bits4.s3 & mask_f000) >> 12) | ((bits2.s3 & mask_c0) >> 2))) - 32.f) * scale_s.s3 * scale_d.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
|
||||
// index of (first N column, first M row) of this tile; consecutive columns are m apart
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
// Store the 8 columns one float4 (4 M rows) at a time. idx only advances inside a
// taken branch, so once one guard fails every later guard fails as well.
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,293 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
// Accumulates dequantized q6 weights times activations into total_sum.s0 / total_sum.s1.
// Expands to a declaration (float8 shared_y) followed by plain statements: it is not
// wrapped in do { } while (0), so it can appear only once per scope and not as the
// body of an unbraced if/else.
//   bits4   8 x 16-bit elements, each holding four 4-bit low parts
//   bits2   8 x 8-bit elements, each holding four 2-bit high parts
//           weight = (low | high << 4) - 32
//   y       float8 activations; the values of sub-group lanes 0 and 1 are broadcast
//   even elements (s0, s2, s4, s6) feed total_sum.s0 with scale_s.s0 * scale_d.s0,
//   odd elements (s1, s3, s5, s7) feed total_sum.s1 with scale_s.s2 * scale_d.s1.
#define dequantize_block_acc_bcast_8_hi(total_sum, bits4, bits2, scale_d, scale_s, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x000F) ) | ((bits2.s0 & 0x03) << 4)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s0; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x00F0) >> 4) | ((bits2.s0 & 0x0C) << 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s1; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x0F00) >> 8) | ((bits2.s0 & 0x30) )) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s2; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0xF000) >> 12) | ((bits2.s0 & 0xC0) >> 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s3; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x000F) ) | ((bits2.s2 & 0x03) << 4)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s4; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x00F0) >> 4) | ((bits2.s2 & 0x0C) << 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s5; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x0F00) >> 8) | ((bits2.s2 & 0x30) )) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s6; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0xF000) >> 12) | ((bits2.s2 & 0xC0) >> 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s7; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x000F) ) | ((bits2.s1 & 0x03) << 4)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s0; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x00F0) >> 4) | ((bits2.s1 & 0x0C) << 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s1; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x0F00) >> 8) | ((bits2.s1 & 0x30) )) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s2; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0xF000) >> 12) | ((bits2.s1 & 0xC0) >> 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s3; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x000F) ) | ((bits2.s3 & 0x03) << 4)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s4; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x00F0) >> 4) | ((bits2.s3 & 0x0C) << 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s5; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x0F00) >> 8) | ((bits2.s3 & 0x30) )) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s6; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0xF000) >> 12) | ((bits2.s3 & 0xC0) >> 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x000F) ) | ((bits2.s4 & 0x03) << 4)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s0; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x00F0) >> 4) | ((bits2.s4 & 0x0C) << 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s1; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x0F00) >> 8) | ((bits2.s4 & 0x30) )) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s2; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0xF000) >> 12) | ((bits2.s4 & 0xC0) >> 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s3; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x000F) ) | ((bits2.s6 & 0x03) << 4)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s4; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x00F0) >> 4) | ((bits2.s6 & 0x0C) << 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s5; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x0F00) >> 8) | ((bits2.s6 & 0x30) )) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s6; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0xF000) >> 12) | ((bits2.s6 & 0xC0) >> 2)) - 32.f) * scale_s.s0 * scale_d.s0 * shared_y.s7; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x000F) ) | ((bits2.s5 & 0x03) << 4)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s0; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x00F0) >> 4) | ((bits2.s5 & 0x0C) << 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s1; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x0F00) >> 8) | ((bits2.s5 & 0x30) )) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s2; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0xF000) >> 12) | ((bits2.s5 & 0xC0) >> 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s3; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x000F) ) | ((bits2.s7 & 0x03) << 4)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s4; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x00F0) >> 4) | ((bits2.s7 & 0x0C) << 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s5; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x0F00) >> 8) | ((bits2.s7 & 0x30) )) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s6; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0xF000) >> 12) | ((bits2.s7 & 0xC0) >> 2)) - 32.f) * scale_s.s2 * scale_d.s1 * shared_y.s7; \
|
||||
|
||||
// Companion of dequantize_block_acc_bcast_8_hi with the same bit unpacking
// (weight = (4 low bits | 2 high bits << 4) - 32), but using the activations
// broadcast from sub-group lanes 2 and 3 and the scales scale_s.s1 (for
// total_sum.s0) and scale_s.s3 (for total_sum.s1).
// It assigns to shared_y without declaring it, so a float8 shared_y must already be
// in scope at the expansion site (dequantize_block_acc_bcast_8_hi declares one).
#define dequantize_block_acc_bcast_8_lo(total_sum, bits4, bits2, scale_d, scale_s, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x000F) ) | ((bits2.s0 & 0x03) << 4)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s0; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x00F0) >> 4) | ((bits2.s0 & 0x0C) << 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s1; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0x0F00) >> 8) | ((bits2.s0 & 0x30) )) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s2; \
|
||||
total_sum.s0 += ((float)(((bits4.s0 & 0xF000) >> 12) | ((bits2.s0 & 0xC0) >> 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s3; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x000F) ) | ((bits2.s2 & 0x03) << 4)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s4; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x00F0) >> 4) | ((bits2.s2 & 0x0C) << 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s5; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0x0F00) >> 8) | ((bits2.s2 & 0x30) )) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s6; \
|
||||
total_sum.s0 += ((float)(((bits4.s2 & 0xF000) >> 12) | ((bits2.s2 & 0xC0) >> 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s7; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x000F) ) | ((bits2.s1 & 0x03) << 4)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s0; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x00F0) >> 4) | ((bits2.s1 & 0x0C) << 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s1; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0x0F00) >> 8) | ((bits2.s1 & 0x30) )) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s2; \
|
||||
total_sum.s1 += ((float)(((bits4.s1 & 0xF000) >> 12) | ((bits2.s1 & 0xC0) >> 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s3; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x000F) ) | ((bits2.s3 & 0x03) << 4)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s4; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x00F0) >> 4) | ((bits2.s3 & 0x0C) << 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s5; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0x0F00) >> 8) | ((bits2.s3 & 0x30) )) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s6; \
|
||||
total_sum.s1 += ((float)(((bits4.s3 & 0xF000) >> 12) | ((bits2.s3 & 0xC0) >> 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x000F) ) | ((bits2.s4 & 0x03) << 4)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s0; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x00F0) >> 4) | ((bits2.s4 & 0x0C) << 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s1; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0x0F00) >> 8) | ((bits2.s4 & 0x30) )) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s2; \
|
||||
total_sum.s0 += ((float)(((bits4.s4 & 0xF000) >> 12) | ((bits2.s4 & 0xC0) >> 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s3; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x000F) ) | ((bits2.s6 & 0x03) << 4)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s4; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x00F0) >> 4) | ((bits2.s6 & 0x0C) << 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s5; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0x0F00) >> 8) | ((bits2.s6 & 0x30) )) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s6; \
|
||||
total_sum.s0 += ((float)(((bits4.s6 & 0xF000) >> 12) | ((bits2.s6 & 0xC0) >> 2)) - 32.f) * scale_s.s1 * scale_d.s0 * shared_y.s7; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x000F) ) | ((bits2.s5 & 0x03) << 4)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s0; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x00F0) >> 4) | ((bits2.s5 & 0x0C) << 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s1; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0x0F00) >> 8) | ((bits2.s5 & 0x30) )) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s2; \
|
||||
total_sum.s1 += ((float)(((bits4.s5 & 0xF000) >> 12) | ((bits2.s5 & 0xC0) >> 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s3; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x000F) ) | ((bits2.s7 & 0x03) << 4)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s4; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x00F0) >> 4) | ((bits2.s7 & 0x0C) << 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s5; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0x0F00) >> 8) | ((bits2.s7 & 0x30) )) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s6; \
|
||||
total_sum.s1 += ((float)(((bits4.s7 & 0xF000) >> 12) | ((bits2.s7 & 0xC0) >> 2)) - 32.f) * scale_s.s3 * scale_d.s1 * shared_y.s7; \
|
||||
|
||||
/*
 * Extract 6-bit quant number n (0..3) from a packed pair:
 * the low 4 bits are nibble n of the 16-bit word, the high 2 bits are
 * bit-pair n of the 8-bit word.
 */
#define Q6K_BCAST1_HI_UNPACK(lo16, hi8, nib) \
    ((((lo16) >> (4 * (nib))) & 0x000F) | ((((hi8) >> (2 * (nib))) & 0x03) << 4))

/*
 * One broadcast step: fetch a single activation component from sub-group
 * lane `lane`, then accumulate one weight for each of the two outputs.
 * Output .s0 uses word `wa`, scale_s.s0 and scale_d.s0; output .s1 uses
 * word `wb`, scale_s.s2 and scale_d.s1. Quants are re-centred by -32.
 * Relies on a `float shared_y` being visible at the expansion site.
 */
#define Q6K_BCAST1_HI_STEP(acc, q4, q2, sd, ss, yv, lane, wa, wb, nib) \
    shared_y = sub_group_broadcast(yv, lane); \
    acc.s0 += ((float)Q6K_BCAST1_HI_UNPACK(q4.wa, q2.wa, nib) - 32.f) * ss.s0 * sd.s0 * shared_y; \
    acc.s1 += ((float)Q6K_BCAST1_HI_UNPACK(q4.wb, q2.wb, nib) - 32.f) * ss.s2 * sd.s1 * shared_y

/*
 * Scalar-broadcast accumulation for the first half of a block.
 *
 * total_sum : two-component accumulator (.s0 / .s1), updated in place
 * bits4     : 8 x 16-bit words holding the low 4 bits of the quants
 * bits2     : 8 x 8-bit words holding the high 2 bits of the quants
 * scale_d   : per-output scale pair (.s0 / .s1)
 * scale_s   : sub-block scales; this half reads .s0 and .s2
 * y         : 8 activations held by the calling lane; the values actually
 *             used are those broadcast from sub-group lanes 0 and 1
 *
 * NOTE: this macro declares `float shared_y` in the caller's scope on
 * purpose; dequantize_block_acc_bcast_1_lo may reuse that variable, so the
 * two must be expanded in the same scope with this one first.
 */
#define dequantize_block_acc_bcast_1_hi(total_sum, bits4, bits2, scale_d, scale_s, y) \
    float shared_y; \
    /* lane 0, activations 0..3 -> words s0 / s1 */ \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s0, 0, s0, s1, 0); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s1, 0, s0, s1, 1); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s2, 0, s0, s1, 2); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s3, 0, s0, s1, 3); \
    /* lane 0, activations 4..7 -> words s2 / s3 */ \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s4, 0, s2, s3, 0); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s5, 0, s2, s3, 1); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s6, 0, s2, s3, 2); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s7, 0, s2, s3, 3); \
    /* lane 1, activations 0..3 -> words s4 / s5 */ \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s0, 1, s4, s5, 0); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s1, 1, s4, s5, 1); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s2, 1, s4, s5, 2); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s3, 1, s4, s5, 3); \
    /* lane 1, activations 4..7 -> words s6 / s7 */ \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s4, 1, s6, s7, 0); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s5, 1, s6, s7, 1); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s6, 1, s6, s7, 2); \
    Q6K_BCAST1_HI_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s7, 1, s6, s7, 3);
|
||||
|
||||
/*
 * Extract 6-bit quant number n (0..3) from a packed pair:
 * the low 4 bits are nibble n of the 16-bit word, the high 2 bits are
 * bit-pair n of the 8-bit word.
 */
#define Q6K_BCAST1_LO_UNPACK(lo16, hi8, nib) \
    ((((lo16) >> (4 * (nib))) & 0x000F) | ((((hi8) >> (2 * (nib))) & 0x03) << 4))

/*
 * One broadcast step: fetch a single activation component from sub-group
 * lane `lane`, then accumulate one weight for each of the two outputs.
 * Output .s0 uses word `wa`, scale_s.s1 and scale_d.s0; output .s1 uses
 * word `wb`, scale_s.s3 and scale_d.s1. Quants are re-centred by -32.
 */
#define Q6K_BCAST1_LO_STEP(acc, q4, q2, sd, ss, yv, lane, wa, wb, nib) \
    shared_y = sub_group_broadcast(yv, lane); \
    acc.s0 += ((float)Q6K_BCAST1_LO_UNPACK(q4.wa, q2.wa, nib) - 32.f) * ss.s1 * sd.s0 * shared_y; \
    acc.s1 += ((float)Q6K_BCAST1_LO_UNPACK(q4.wb, q2.wb, nib) - 32.f) * ss.s3 * sd.s1 * shared_y

/*
 * Scalar-broadcast accumulation for the second half of a block.
 *
 * total_sum : two-component accumulator (.s0 / .s1), updated in place
 * bits4     : 8 x 16-bit words holding the low 4 bits of the quants
 * bits2     : 8 x 8-bit words holding the high 2 bits of the quants
 * scale_d   : per-output scale pair (.s0 / .s1)
 * scale_s   : sub-block scales; this half reads .s1 and .s3
 * y         : 8 activations held by the calling lane; the values actually
 *             used are those broadcast from sub-group lanes 2 and 3
 *
 * The body is wrapped in do { } while (0) with its own `shared_y`, so the
 * macro is a single statement and no longer depends on another macro having
 * declared `shared_y` earlier in the enclosing scope.
 */
#define dequantize_block_acc_bcast_1_lo(total_sum, bits4, bits2, scale_d, scale_s, y) \
    do { \
        float shared_y; \
        /* lane 2, activations 0..3 -> words s0 / s1 */ \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s0, 2, s0, s1, 0); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s1, 2, s0, s1, 1); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s2, 2, s0, s1, 2); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s3, 2, s0, s1, 3); \
        /* lane 2, activations 4..7 -> words s2 / s3 */ \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s4, 2, s2, s3, 0); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s5, 2, s2, s3, 1); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s6, 2, s2, s3, 2); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s7, 2, s2, s3, 3); \
        /* lane 3, activations 0..3 -> words s4 / s5 */ \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s0, 3, s4, s5, 0); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s1, 3, s4, s5, 1); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s2, 3, s4, s5, 2); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s3, 3, s4, s5, 3); \
        /* lane 3, activations 4..7 -> words s6 / s7 */ \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s4, 3, s6, s7, 0); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s5, 3, s6, s7, 1); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s6, 3, s6, s7, 2); \
        Q6K_BCAST1_LO_STEP(total_sum, bits4, bits2, scale_d, scale_s, y.s7, 3, s6, s7, 3); \
    } while (0)
|
||||
|
||||
#if defined(ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
/*
 * Matrix-vector product for 6-bit quantized weights stored as separate
 * planes: low 4 bits (src0_ql), high 2 bits (src0_qh), sub-block scales
 * (src0_s) and block scales (src0_d).
 *
 * Each work-item accumulates two outputs (float2) and stores them at
 * dst[gid*2 .. gid*2+1]. Work is split along the reduction dimension over
 * the sub-groups of a work-group (local id 1), and the partial sums are
 * combined through local memory by sub-group 0.
 *
 * src0_ql, src0_qh : weight planes, read as image buffers
 * src0_s, src0_d   : scales, one half2 (reinterpreted as char4 for src0_s)
 *                    per work-item and step
 * src1             : activations; only sub-group lanes 0..3 load them, the
 *                    dequantize macros broadcast them to the other lanes
 * dst, offsetd     : output buffer and byte offset into it
 * ne00, ne01       : reduction length and number of rows
 *
 * NOTE(review): the reduction buffer and the grp 1..3 stores assume
 * NSUBGROUPS == 4 - confirm against the host-side build options.
 */
kernel void kernel_gemv_noshuffle_q6_K_f32(
        read_only image1d_buffer_t src0_ql,
        read_only image1d_buffer_t src0_qh,
        global half2 * src0_s,
        global half2 * src0_d,
        read_only image1d_buffer_t src1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01
) {
    const int    grp  = get_local_id(1);
    const int    gid  = get_global_id(0);
    const ushort slid = get_sub_group_local_id();

    // The reduction dimension is walked in steps of 32 weights.
    const int nb = ne00 / 32;

    const int line_stride_a  = ne01 / 2;
    const int block_stride_a = NSUBGROUPS * ne01;

    uint4   reg_a_l;
    ushort4 reg_a_h;
    half2   reg_d;
    char4   reg_s;
    float8  reg_b;

    float2 total_sum = 0.0f;

    for (int k = grp; k < nb; k += NSUBGROUPS) {
        // Index of this work-item's first line within step k.
        const int base = gid + k*block_stride_a;

        reg_d = src0_d[gid + k/8 * line_stride_a];
        reg_s = as_char4(src0_s[gid + k * line_stride_a]);

        // Lanes 0..3 each fetch 8 of the 32 activations for this step.
        if (slid < 4) {
            reg_b.s0123 = read_imagef(src1, 0 + slid*2 + k*8);
            reg_b.s4567 = read_imagef(src1, 1 + slid*2 + k*8);
        }

        // First half: lines 0..3.
        reg_a_l = (uint4)(read_imageui(src0_ql, base + line_stride_a*0).x,
                          read_imageui(src0_ql, base + line_stride_a*1).x,
                          read_imageui(src0_ql, base + line_stride_a*2).x,
                          read_imageui(src0_ql, base + line_stride_a*3).x);

        reg_a_h = (ushort4)(as_ushort(read_imageh(src0_qh, base + line_stride_a*0).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*1).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*2).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*3).x));

#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantize_block_acc_bcast_8_hi(total_sum, as_ushort8(reg_a_l), as_uchar8(reg_a_h), reg_d, reg_s, reg_b);
#else
        dequantize_block_acc_bcast_1_hi(total_sum, as_ushort8(reg_a_l), as_uchar8(reg_a_h), reg_d, reg_s, reg_b);
#endif // VECTOR_SUB_GROUP_BROADCAT

        // Second half: lines 4..7.
        reg_a_l = (uint4)(read_imageui(src0_ql, base + line_stride_a*4).x,
                          read_imageui(src0_ql, base + line_stride_a*5).x,
                          read_imageui(src0_ql, base + line_stride_a*6).x,
                          read_imageui(src0_ql, base + line_stride_a*7).x);

        reg_a_h = (ushort4)(as_ushort(read_imageh(src0_qh, base + line_stride_a*4).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*5).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*6).x),
                            as_ushort(read_imageh(src0_qh, base + line_stride_a*7).x));

#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantize_block_acc_bcast_8_lo(total_sum, as_ushort8(reg_a_l), as_uchar8(reg_a_h), reg_d, reg_s, reg_b);
#else
        dequantize_block_acc_bcast_1_lo(total_sum, as_ushort8(reg_a_l), as_uchar8(reg_a_h), reg_d, reg_s, reg_b);
#endif // VECTOR_SUB_GROUP_BROADCAT
    }

    // Sub-groups 1..3 publish their partial sums; slot (grp - 1) per sub-group.
    local float2 reduce_lm[SUBGROUP_SIZE * 3];
    if (grp >= 1 && grp <= 3) {
        reduce_lm[SUBGROUP_SIZE*(grp - 1) + slid] = total_sum;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Sub-group 0 folds the three partials in slot order and writes the result.
    if (grp == 0) {
        for (int slot = 0; slot < 3; ++slot) {
            total_sum += reduce_lm[SUBGROUP_SIZE*slot + slid];
        }

        dst = (global float*)((global char*)dst + offsetd);
        vstore2(total_sum, 0, &(dst[gid * 2]));
    }
}
|
||||
|
|
@ -0,0 +1,179 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Elements fetched per load for A (weights) and B (activations).
#define LOAD_VEC_A 4
#define LOAD_VEC_B 4

// Work-group tile: BM x BN outputs, advancing BK along the reduction dim.
#define BM 64
#define BN 64
#define BK 32
// Per-work-item register tile: TM x TN outputs.
#define TM 4
#define TN 8

/*
 * Tiled matrix-matrix product, 4-bit K-quantized weights (flat layout) times
 * f32 activations, staged through local memory.
 *
 * src0_q   : packed 4-bit quants, addressed as uchar4 (32 uchar4 per block)
 * src0_s   : packed 6-bit scales/mins, 12 bytes per block
 * src0_d   : per-block scale for the scales
 * src0_dm  : per-block scale for the mins
 * src1     : activations, offset by offset1 bytes
 * dst      : output, offset by offsetd bytes
 * ne00     : reduction length; ne01 / ne11: rows of A / columns of B
 * ne02,ne12: batch sizes; r2, r3: batch broadcast ratios of B over A
 * stride_* / batch_stride_* : row and batch strides in elements
 *
 * Each work-group produces a BM x BN tile; each work-item a TM x TN sub-tile.
 * Out-of-range rows/columns are zero-filled on load and skipped on store.
 */
kernel void kernel_mul_mm_q4_k_f32_l4_lm(
    global uchar4 * src0_q,
    global uchar  * src0_s,
    global half   * src0_d,
    global half   * src0_dm,
    global float4 * src1,
    ulong offset1,
    global float  * dst,
    ulong offsetd,

    int ne00,
    int ne01,
    int ne02,
    int ne11,
    int ne12,

    int stride_a,
    int stride_b,
    int stride_d,

    int batch_stride_a,
    int batch_stride_b,
    int batch_stride_d,

    int r2,
    int r3
) {
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float *)((global char*)dst  + offsetd);

    // Both tiles are stored k-major: element (k, row) lives at k*BM + row.
    local float buf_a[BM * BK];
    local float buf_b[BN * BK];

    const int batch_idx = get_global_id(2);

    const int i13 = batch_idx / ne12;
    const int i12 = batch_idx % ne12;

    const int i03 = i13 / r3;
    const int i02 = i12 / r2;

    const int batch_idx_a = i03 * ne02 + i02;

    const int ir = get_group_id(0);
    const int ic = get_group_id(1);

    const int tid  = get_local_id(0);
    const int th_r = tid % (BM / TM);
    const int th_c = tid / (BM / TM);

    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);

    // Number of tile rows the whole work-group loads per pass.
    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;

    // Positions are kept in units of LOAD_VEC_* elements.
    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;

    float sums[TM * TN];
    float cache_a[TM];
    float cache_b[TN];

    for (int i = 0; i < TM * TN; i++) {
        sums[i] = 0.0f;
    }

    for (int block = 0; block < ne00; block += BK) {
        // Stage A: dequantize 4 weights per work-item per pass.
        for (int l = 0; l < BM; l += loadstride_a) {
            if (ir*BM + loadc_a + l < ne01) {
                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
                int ib  = idx / 64;          // block index (64 groups of 4 per block)
                int iqs = (idx % 64) * 2;

                int n   = iqs / 32;          // 64-weight chunk within the block, 0..3
                int b   = (iqs % 32) / 16;   // 0: low nibbles, 1: high nibbles
                int is  = 2 * n + b;         // 32-weight sub-block, 0..7
                int qsi = n * 32 + (iqs % 16) * 2;  // byte offset into the quants

                // Fix: keep the global address space and unsigned element type
                // of src0_s instead of converting to an unqualified `char *`.
                global const uchar * scales = src0_s + ib * 12;

                // Select where the 6-bit scale / min of sub-block `is` live in
                // the 12 packed bytes: low bits in one byte, top 2 bits in another.
                int scidx0 = (is < 4) ? is : (is + 4);
                int scidx1 = (is < 4) ? is : (is - 4);
                int scidxmask1 = (is < 4) ? 0x30 : 0xC0;
                int scidxshift1 = (is < 4) ? 0 : 2;
                int mbidx0 = is + 4;
                int mbidx1 = (is < 4) ? is + 4 : is;
                int mbidxmask0 = (is < 4) ? 0xF : 0xF0;
                int mbidxshift0 = (is < 4) ? 0 : 4;
                int mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
                int mbidxshift1 = (is < 4) ? 0 : 2;

                uchar sc    = (scales[scidx0] & 0xF) | ((scales[scidx1] & scidxmask1) >> scidxshift1);
                uchar mbyte = ((scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((scales[mbidx1] & mbidxmask1) >> mbidxshift1);

                // weight = q * d + m
                float d = (float)src0_d[ib] * (float)sc;
                float m = -(float)src0_dm[ib] * (float)mbyte;

                global uchar4 * qs = src0_q + ib*32 + (qsi >> 2);
                uchar4 q = *qs;
                float4 v1 = (convert_float4((uchar4)((q.s0 >> (b * 4))&0x0F, (q.s1 >> (b * 4))&0x0F, (q.s2 >> (b * 4))&0x0F, (q.s3 >> (b * 4))&0x0F)))*d + m;

                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = v1.s0;
                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = v1.s1;
                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = v1.s2;
                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = v1.s3;
            } else {
                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
            }
        }

        // Stage B: copy 4 activations per work-item per pass.
        for (int l = 0; l < BN; l += loadstride_b) {
            if (ic*BN + loadc_b + l < ne11) {
                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
                const float4 vb = src1[idx];
                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = vb.s0;
                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = vb.s1;
                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = vb.s2;
                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = vb.s3;
            } else {
                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
            }
        }

        // Tiles must be complete before anyone reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        pos_a += BK / LOAD_VEC_A;
        pos_b += BK / LOAD_VEC_B;

        for (int i = 0; i < BK; i++) {
            for (int j = 0; j < TM; j++) {
                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
            }

            for (int j = 0; j < TN; j++) {
                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
            }

            for (int cc = 0; cc < TN; cc++) {
                for (int cr = 0; cr < TM; cr++) {
                    const int sums_idx = cc*TM + cr;
                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
                }
            }
        }
        // Everyone must be done reading before the next pass overwrites the tiles.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    const int dr = ir * BM + th_r * TM;
    const int dc = ic * BN + th_c * TN;

    const int offsets = batch_idx * batch_stride_d;

    for (int cc = 0; cc < TN; cc++) {
        for (int cr = 0; cr < TM; cr++) {
            if (dr + cr < ne01 && dc + cc < ne11) {
                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
            }
        }
    }
}
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

//------------------------------------------------------------------------------
// block_q4_K
//------------------------------------------------------------------------------
#define QK_K            256
#define BLOCK_Q4K_SIZE  144
#define K_SCALE_SIZE    12

// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
typedef struct {
    half d;    // super-block scale for quantized scales
    half dmin; // super-block scale for quantized mins

    uchar scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uchar qs[QK_K/2];           // 4-bit quants
} block_q4_K;

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 4         // number of rows each SIMD group works on
#define N_SIMDGROUP 1   // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16  // SIMD group size
#elif defined (ADRENO_GPU)
#define N_DST 16
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif

#undef  BLOCK_STRIDE
// number of (super) blocks each subgroup processes
// each thread in a subgroup processes a block (32 weights)
#define BLOCK_STRIDE (N_SIMDWIDTH/8)

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
/*
 * Matrix-vector product: Q4_K weights in flat layout (separate buffers for
 * quants, scales, d and dmin) times f32 activations.
 *
 * src0_q / src0_s / src0_d / src0_dm : the four planes of the weight tensor
 * src1, offset1 : activations and byte offset into them
 * dst, offsetd  : output and byte offset into it
 * ne00, ne01    : row length and number of rows of the weights
 * nb01..nb03    : byte strides of the weights (in block_q4_K units of 144 B)
 * ne12, nb11..nb13 : batch size and byte strides of the activations
 * ne0, ne1      : output dimensions
 * r2, r3        : batch broadcast ratios of src1 over src0
 *
 * Each sub-group computes N_DST consecutive rows. Within a sub-group, every
 * group of 8 lanes shares one super-block (ix selects the super-block, it the
 * 32-weight slice), and the per-lane partial sums are combined with
 * sub_group_reduce_add.
 */
kernel void kernel_mul_mv_q4_K_f32_flat(
    global uchar * src0_q,
    global uchar * src0_s,
    global half  * src0_d,
    global half  * src0_dm,
    global char  * src1,
    int offset1,
    global char  * dst,
    int offsetd,
    int ne00,
    int ne01,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne12,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    // Masks for unpacking the 6-bit scales/mins, two bytes at a time.
    ushort kmask1 = 0x3f3f;
    ushort kmask2 = 0x0f0f;
    ushort kmask3 = 0xc0c0;

    int ix = get_sub_group_local_id()/8;  // super-block handled by this lane
    int it = get_sub_group_local_id()%8;  // slice within the super-block
    int iq = it/4;
    int ir = it%4;

    int nb = ne00/QK_K;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // Fix: the stride products are ulong; keep the derived offsets 64-bit so
    // they (and offset_src0 * 128 below) cannot wrap for large tensors.
    ulong offset_src0 = (first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03)/BLOCK_Q4K_SIZE;
    uint  blk = nb01 / BLOCK_Q4K_SIZE;    // super-blocks per weight row
    global uchar * blk_q  = (global uchar *)src0_q  + offset_src0*(QK_K/2);
    global uchar * blk_s  = (global uchar *)src0_s  + offset_src0*K_SCALE_SIZE;
    global half  * blk_d  = (global half  *)src0_d  + offset_src0;
    global half  * blk_dm = (global half  *)src0_dm + offset_src0;

    ulong offset_src1 = r1*nb11 + (i12)*nb12 + (i13)*nb13;
    global float * y = (global float *)(src1 + offset_src1);

    float yl[16];
    float yh[16];
    float sumf[N_DST] = {0.f};
    float all_sum;

    global float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;

    ushort  sc16[4];
    uchar * sc8 = (uchar *)sc16;  // byte view of the unpacked scales/mins

    for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
        // Load this lane's 32 activations and their per-sub-block sums
        // (the sums are needed for the dmin term).
        float4 sumy = {0.f, 0.f, 0.f, 0.f};
        for (int i = 0; i < 8; ++i) {
            yl[i+0] = y4[i+0];
            sumy.s0 += yl[i+0];

            yl[i+8] = y4[i+32];
            sumy.s1 += yl[i+8];

            yh[i+0] = y4[i+128];
            sumy.s2 += yh[i+0];

            yh[i+8] = y4[i+160];
            sumy.s3 += yh[i+8];
        }

        global ushort * q1 = (global ushort *)(blk_q + ib * (QK_K/2)) + (16 * iq + 4 * ir);
        global ushort * sc = (global ushort *)(blk_s + ib * K_SCALE_SIZE) + iq;
        global half   * d  = blk_d  + ib;
        global half   * dm = blk_dm + ib;

        for (int row = 0; row < N_DST; row++) {
            // Fix: do not read weight rows past the end of the tensor when
            // ne01 is not a multiple of N_DST. first_row is uniform across
            // the sub-group, so all lanes leave the loop together.
            if (first_row + row >= ne01) {
                break;
            }

            sc16[0] = sc[0] & kmask1;
            sc16[1] = sc[2] & kmask1;
            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);

            global ushort * q2 = q1 + 32;

            // The nibbles are masked in place rather than shifted down; the
            // 1/256 and 1/16 factors below compensate for their positions.
            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
            for (int i = 0; i < 8; i += 2) {
                acc1.s0 += yl[i+0] * (q1[i/2] & 0x000F);
                acc1.s1 += yl[i+1] * (q1[i/2] & 0x0F00);
                acc1.s2 += yl[i+8] * (q1[i/2] & 0x00F0);
                acc1.s3 += yl[i+9] * (q1[i/2] & 0xF000);
                acc2.s0 += yh[i+0] * (q2[i/2] & 0x000F);
                acc2.s1 += yh[i+1] * (q2[i/2] & 0x0F00);
                acc2.s2 += yh[i+8] * (q2[i/2] & 0x00F0);
                acc2.s3 += yh[i+9] * (q2[i/2] & 0xF000);
            }

            float dall = *d;
            float dmin = *dm;
            sumf[row] += dall * ((acc1.s0 + 1.f/256.f * acc1.s1) * sc8[0] +
                                 (acc1.s2 + 1.f/256.f * acc1.s3) * sc8[1] * 1.f/16.f +
                                 (acc2.s0 + 1.f/256.f * acc2.s1) * sc8[4] +
                                 (acc2.s2 + 1.f/256.f * acc2.s3) * sc8[5] * 1.f/16.f) -
                         dmin * (sumy.s0 * sc8[2] + sumy.s1 * sc8[3] + sumy.s2 * sc8[6] + sumy.s3 * sc8[7]);

            // Advance every plane by one weight row (blk super-blocks).
            q1 += blk*64;
            sc += blk*6;
            d  += blk;
            dm += blk;
        }

        y4 += BLOCK_STRIDE * QK_K;
    }

    global float * dst_f32 = (global float *) dst + im*ne0*ne1 + r1*ne0;

    for (int row = 0; row < N_DST; ++row) {
        all_sum = sub_group_reduce_add(sumf[row]);
        if (first_row + row < ne01) {
            if (get_sub_group_local_id() == 0) {
                dst_f32[first_row + row] = all_sum;
            }
        }
    }
}
|
||||
|
|
@ -97,6 +97,8 @@ struct ggml_backend_openvino_buffer_context {
|
|||
ov_buffer = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
|
||||
} else {
|
||||
data = ggml_aligned_malloc(size);
|
||||
GGML_ASSERT(data);
|
||||
memset(data, 0, size);
|
||||
ov_buffer = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -589,8 +589,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
} else {
|
||||
result.buffer = 0;
|
||||
result.data = 0;
|
||||
}
|
||||
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
result.ne[i] = tensor->ne[i];
|
||||
|
|
@ -606,7 +608,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|||
}
|
||||
result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
|
||||
result.view_offs = tensor->view_offs;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
|
||||
// Avoid sending uninitialized data over the wire
|
||||
memset(result.name, 0, sizeof(result.name));
|
||||
|
|
@ -1162,12 +1163,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
// Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types)
|
||||
if (ggml_blck_size((enum ggml_type)tensor->type) == 0) {
|
||||
GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
|
||||
// ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
|
||||
if (result == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type);
|
||||
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -1440,6 +1447,10 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
|
|||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (result->buffer == nullptr && result->data != nullptr) {
|
||||
GGML_LOG_ERROR("[%s] invalid data ptr", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
tensor_map[id] = result;
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
// Check if the source ID is 0 before calling create_node recursively
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue