ggerganov commited on
Commit
d321914
·
unverified ·
1 Parent(s): 844e617

ggml : remove old kompute, cann (skip) (#3349)

Browse files
Files changed (50) hide show
  1. ggml/include/ggml-kompute.h +0 -50
  2. ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  3. ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  4. ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  5. ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  6. ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  7. ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  8. ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  9. ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  10. ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  11. ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  12. ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  13. ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  14. ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  15. ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  16. ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  17. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  18. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  19. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  20. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  21. ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  22. ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  23. ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  24. ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  25. ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  26. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  27. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  28. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  29. ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  30. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  31. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  32. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  33. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  34. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  35. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  36. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  37. ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  38. ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  39. ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  40. ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  41. ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  42. ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  43. ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  44. ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  45. ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  46. ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  47. ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  48. ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  49. ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  50. ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
ggml/include/ggml-kompute.h DELETED
@@ -1,50 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #include <stdbool.h>
7
- #include <stddef.h>
8
- #include <stdint.h>
9
-
10
- #ifdef __cplusplus
11
- extern "C" {
12
- #endif
13
-
14
- #define GGML_KOMPUTE_MAX_DEVICES 16
15
-
16
- struct ggml_vk_device {
17
- int index;
18
- int type; // same as VkPhysicalDeviceType
19
- size_t heapSize;
20
- const char * name;
21
- const char * vendor;
22
- int subgroupSize;
23
- uint64_t bufferAlignment;
24
- uint64_t maxAlloc;
25
- };
26
-
27
- struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
28
- bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
29
- bool ggml_vk_has_vulkan(void);
30
- bool ggml_vk_has_device(void);
31
- struct ggml_vk_device ggml_vk_current_device(void);
32
-
33
- //
34
- // backend API
35
- //
36
-
37
- // forward declaration
38
- typedef struct ggml_backend * ggml_backend_t;
39
-
40
- GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
41
-
42
- GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
43
-
44
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
45
-
46
- GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
47
-
48
- #ifdef __cplusplus
49
- }
50
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/CMakeLists.txt DELETED
@@ -1,30 +0,0 @@
1
- file(GLOB SRC_FILES
2
- get_row_f32.cpp
3
- get_row_f16.cpp
4
- get_row_q4_0.cpp
5
- get_row_q8_0.cpp
6
- quantize_f32_q8_0.cpp
7
- quantize_f16_q8_0.cpp
8
- quantize_float_to_q4_0.cpp
9
- dup.cpp
10
- )
11
-
12
- set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
13
- set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
14
-
15
- if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
16
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
17
- elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
18
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
19
- else()
20
- message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
21
- endif()
22
- include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
23
-
24
- ascendc_library(ascendc_kernels STATIC
25
- ${SRC_FILES}
26
- )
27
-
28
- message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
29
- ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
30
- # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/ascendc_kernels.h DELETED
@@ -1,19 +0,0 @@
1
- #ifndef ASCENDC_KERNELS_H
2
- #define ASCENDC_KERNELS_H
3
-
4
- #include "aclrtlaunch_ascendc_get_row_f32.h"
5
- #include "aclrtlaunch_ascendc_get_row_f16.h"
6
- #include "aclrtlaunch_ascendc_get_row_q8_0.h"
7
- #include "aclrtlaunch_ascendc_get_row_q4_0.h"
8
-
9
- #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
10
- #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
11
- #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
12
- #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
13
-
14
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
15
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
16
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
17
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
18
-
19
- #endif // ASCENDC_KERNELS_H
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/dup.cpp DELETED
@@ -1,234 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
-
5
- #define BUFFER_NUM 2
6
- const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
7
-
8
- template <typename SRC_T, typename DST_T>
9
- class DupByRows {
10
- public:
11
- __aicore__ inline DupByRows() {}
12
- __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
13
- size_t *input_nb_ub) {
14
- /* Dup by rows when src is contigous on first dimension and dst is
15
- contiguous, each kernel process one row.
16
- */
17
-
18
- // Input has four dims.
19
- int64_t op_block_num = GetBlockNum();
20
- int64_t op_block_idx = GetBlockIdx();
21
-
22
- // param
23
- num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
24
- num_elem = input_ne_ub[0];
25
-
26
- // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
27
- idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
28
- idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
29
- / (input_ne_ub[1]);
30
- idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
31
- - idx_ne2 * input_ne_ub[1];
32
-
33
- // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
34
- src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
35
- + input_nb_ub[1] * idx_ne1;
36
-
37
- // dst is contiguous
38
- dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
39
-
40
- src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
41
- src_stride));
42
- dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
43
- dst_stride));
44
-
45
- pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
46
- 32 - 1) / 32 * 32);
47
- pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
48
- 32 - 1) / 32 * 32);
49
- }
50
-
51
- __aicore__ inline void copy_in() {
52
- LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
53
- const size_t elem_per_block = 32 / sizeof(SRC_T);
54
- size_t tail = num_elem % elem_per_block;
55
- size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
56
- DataCopy(src_local, src_gm, cpy_elements_len);
57
- src_queue.EnQue(src_local);
58
- }
59
-
60
- __aicore__ inline void copy_out() {
61
- LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
62
- #ifdef ASCEND_310P
63
- const size_t elem_per_block = 32 / sizeof(DST_T);
64
- size_t tail = num_elem % elem_per_block;
65
- size_t len = num_elem & ~(elem_per_block - 1);
66
- if (len > 0) {
67
- DataCopy(dst_gm, dst_local, len);
68
- }
69
- if(tail != 0) {
70
- for (size_t i = tail; i < elem_per_block; i++) {
71
- dst_local[len + i].SetValue(0, 0);
72
- }
73
- SetAtomicAdd<float>();
74
- DataCopy(dst_gm[len], dst_local[len], elem_per_block);
75
- SetAtomicNone();
76
- }
77
- #else
78
- DataCopyExtParams dataCopyParams;
79
- dataCopyParams.blockCount = 1;
80
- dataCopyParams.blockLen = num_elem * sizeof(DST_T);
81
- DataCopyPad(dst_gm, dst_local, dataCopyParams);
82
- #endif
83
- dst_queue.FreeTensor(dst_local);
84
- }
85
-
86
- __aicore__ inline void dup() {
87
- // main process, copy one row data from src to dst.
88
- copy_in();
89
-
90
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
91
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
92
-
93
- int32_t BLOCK_NUM = 32 / sizeof(DST_T);
94
- DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
95
- / BLOCK_NUM * BLOCK_NUM);
96
- dst_queue.EnQue<DST_T>(dst_local);
97
-
98
- src_queue.FreeTensor(src_local);
99
- copy_out();
100
- }
101
-
102
- __aicore__ inline void dup_with_cast() {
103
- // main process, copy one row data from src to dst.
104
- // cast dtype from src to dst.
105
- copy_in();
106
-
107
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
108
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
109
-
110
- Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
111
- dst_queue.EnQue<DST_T>(dst_local);
112
-
113
- src_queue.FreeTensor(src_local);
114
- copy_out();
115
- }
116
-
117
- private:
118
-
119
- TPipe pipe;
120
- GlobalTensor<SRC_T> src_gm;
121
- GlobalTensor<DST_T> dst_gm;
122
-
123
- int64_t num_rows;
124
- int64_t num_elem;
125
- int64_t idx_ne3;
126
- int64_t idx_ne2;
127
- int64_t idx_ne1;
128
- int64_t src_stride;
129
- int64_t dst_stride;
130
-
131
- TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
132
- TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
133
- };
134
-
135
- template <typename T>
136
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
137
- auto gm_ptr = (__gm__ uint8_t *)gm;
138
- auto ub_ptr = (uint8_t *)(ub);
139
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
140
- *ub_ptr = *gm_ptr;
141
- }
142
- }
143
-
144
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
145
- GM_ADDR src_gm,
146
- GM_ADDR dst_gm,
147
- GM_ADDR input_ne_gm,
148
- GM_ADDR input_nb_gm,
149
- GM_ADDR output_ne_gm,
150
- GM_ADDR output_nb_gm) {
151
-
152
- int64_t input_ne_ub[4];
153
- size_t input_nb_ub[4];
154
- int64_t output_ne_ub[4];
155
- size_t output_nb_ub[4];
156
-
157
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
158
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
159
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
160
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
161
-
162
- DupByRows<half, half> op;
163
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
164
- op.dup();
165
- }
166
-
167
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
168
- GM_ADDR src_gm,
169
- GM_ADDR dst_gm,
170
- GM_ADDR input_ne_gm,
171
- GM_ADDR input_nb_gm,
172
- GM_ADDR output_ne_gm,
173
- GM_ADDR output_nb_gm) {
174
- int64_t input_ne_ub[4];
175
- size_t input_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
182
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
183
-
184
- DupByRows<float, float> op;
185
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
186
- op.dup();
187
- }
188
-
189
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
190
- GM_ADDR src_gm,
191
- GM_ADDR dst_gm,
192
- GM_ADDR input_ne_gm,
193
- GM_ADDR input_nb_gm,
194
- GM_ADDR output_ne_gm,
195
- GM_ADDR output_nb_gm) {
196
-
197
- int64_t input_ne_ub[4];
198
- size_t input_nb_ub[4];
199
- int64_t output_ne_ub[4];
200
- size_t output_nb_ub[4];
201
-
202
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
203
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
204
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
205
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
206
-
207
- DupByRows<float, half> op;
208
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
209
- op.dup_with_cast();
210
- }
211
-
212
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
213
- GM_ADDR src_gm,
214
- GM_ADDR dst_gm,
215
- GM_ADDR input_ne_gm,
216
- GM_ADDR input_nb_gm,
217
- GM_ADDR output_ne_gm,
218
- GM_ADDR output_nb_gm) {
219
-
220
- // copy params from gm to ub.
221
- int64_t input_ne_ub[4];
222
- size_t input_nb_ub[4];
223
- int64_t output_ne_ub[4];
224
- size_t output_nb_ub[4];
225
-
226
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
227
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
228
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
229
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
230
-
231
- DupByRows<half, float> op;
232
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
233
- op.dup_with_cast();
234
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_f16.cpp DELETED
@@ -1,197 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F16 {
9
- public:
10
- __aicore__ inline GET_ROW_F16() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- // TODO, use template for F16/f32
16
- int64_t op_block_num = GetBlockNum();
17
- op_block_idx = GetBlockIdx();
18
-
19
- for (int i = 0; i < 4; i++) {
20
- input_ne[i] = input_ne_ub[i];
21
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
22
-
23
- indices_ne[i] = indices_ne_ub[i];
24
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
25
-
26
- output_ne[i] = output_ne_ub[i];
27
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
28
- }
29
-
30
- // Indices has two dims. n_elements = all rows should get.
31
- // dr, all rows should this thread get.
32
- uint64_t n_elements =
33
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
34
- dr = n_elements / op_block_num;
35
-
36
- uint64_t tails = n_elements % op_block_num;
37
- if (op_block_idx < tails) {
38
- dr += 1;
39
- ir = dr * op_block_idx;
40
- } else {
41
- ir = dr * op_block_idx + tails;
42
- }
43
-
44
- input_gm.SetGlobalBuffer((__gm__ half *)input);
45
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
46
- output_gm.SetGlobalBuffer((__gm__ float *)output);
47
-
48
- uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
49
- & ~31);
50
- uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
51
- & ~31);
52
-
53
- local_buffer_elems = input_local_buffer_size / sizeof(half);
54
-
55
- // TODO, consider long row that can't put in UB.
56
- // All data should asign to 32. It's ok because all data is align to 32.
57
- pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
58
- pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
59
- }
60
-
61
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
62
- size_t origin_len = len;
63
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
64
- const size_t elem_per_block = 32 / sizeof(half);
65
- size_t tail = len % elem_per_block;
66
- len = len & ~(elem_per_block - 1);
67
- if(tail != 0) {
68
- len += elem_per_block;
69
- }
70
- DataCopy(input_local, input_gm[offset], len);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- const size_t elem_per_block = 32 / sizeof(float);
77
- size_t tail = len % elem_per_block;
78
- len = len & ~(elem_per_block - 1);
79
- if (len > 0) {
80
- DataCopy(output_gm[offset], output_local, len);
81
- }
82
-
83
- if(tail != 0) {
84
- #ifdef ASCEND_310P
85
- for (size_t i = tail; i < elem_per_block; i++) {
86
- output_local[len + i].SetValue(0, 0);
87
- }
88
- SetAtomicAdd<float>();
89
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
90
- SetAtomicNone();
91
- #else
92
- DataCopyExtParams dataCopyParams;
93
- dataCopyParams.blockCount = 1;
94
- dataCopyParams.blockLen = tail * sizeof(float);
95
- DataCopyPad(output_gm[offset + len], output_local[len],
96
- dataCopyParams);
97
- #endif
98
- }
99
- output_queue.FreeTensor(output_local);
100
- }
101
-
102
- __aicore__ inline void calculate_row(int64_t idx) {
103
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
104
- const int64_t indices_ne1_idx =
105
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
106
- indices_ne[0];
107
- const int64_t indices_ne0_idx =
108
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
109
- indices_ne1_idx * indices_ne[0]);
110
-
111
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
112
- indices_ne1_idx * indices_stride[1] +
113
- indices_ne2_idx * indices_stride[2];
114
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
115
-
116
- const int64_t input_offset = selected_row_idx * input_stride[1] +
117
- indices_ne1_idx * input_stride[2] +
118
- indices_ne2_idx * input_stride[3];
119
-
120
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
121
- indices_ne1_idx * output_stride[2] +
122
- indices_ne2_idx * output_stride[3];
123
-
124
- copy_in(input_offset, input_ne[0]);
125
- LocalTensor<half> input_local = input_queue.DeQue<half>();
126
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
127
-
128
- Cast(output_local, input_local, RoundMode::CAST_NONE,
129
- local_buffer_elems);
130
- output_queue.EnQue(output_local);
131
- copy_out(output_offset, input_ne[0]);
132
-
133
- input_queue.FreeTensor(input_local);
134
- }
135
-
136
- __aicore__ inline void calculate() {
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- calculate_row(i);
139
- }
140
- }
141
-
142
- private:
143
- int64_t input_ne[4];
144
- size_t input_stride[4];
145
-
146
- int64_t indices_ne[4];
147
- size_t indices_stride[4];
148
-
149
- int64_t output_ne[4];
150
- size_t output_stride[4];
151
-
152
- size_t local_buffer_elems;
153
-
154
- int64_t ir;
155
- int64_t dr;
156
-
157
- TPipe pipe;
158
- GlobalTensor<half> input_gm;
159
- GlobalTensor<int32_t> indices_gm;
160
- GlobalTensor<float> output_gm;
161
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
162
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
163
- int64_t op_block_idx;
164
- };
165
-
166
- template <typename T>
167
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
168
- auto gm_ptr = (__gm__ uint8_t *)gm;
169
- auto ub_ptr = (uint8_t *)(ub);
170
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
171
- *ub_ptr = *gm_ptr;
172
- }
173
- }
174
-
175
- extern "C" __global__ __aicore__ void ascendc_get_row_f16(
176
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
177
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
178
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
179
- int64_t input_ne_ub[4];
180
- size_t input_nb_ub[4];
181
- int64_t indices_ne_ub[4];
182
- size_t indices_nb_ub[4];
183
- int64_t output_ne_ub[4];
184
- size_t output_nb_ub[4];
185
-
186
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
187
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
188
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
189
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
190
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
191
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
192
-
193
- GET_ROW_F16 op;
194
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
195
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
196
- op.calculate();
197
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_f32.cpp DELETED
@@ -1,190 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F32 {
9
- public:
10
- __aicore__ inline GET_ROW_F32() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- int64_t op_block_num = GetBlockNum();
16
- op_block_idx = GetBlockIdx();
17
-
18
- for (int i = 0; i < 4; i++) {
19
- input_ne[i] = input_ne_ub[i];
20
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
21
-
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
-
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // Indices has two dims. n_elements = all rows should get.
30
- // dr, all rows should this thread get.
31
- uint64_t n_elements =
32
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
33
- dr = n_elements / op_block_num;
34
-
35
- uint64_t tails = n_elements % op_block_num;
36
- if (op_block_idx < tails) {
37
- dr += 1;
38
- ir = dr * op_block_idx;
39
- } else {
40
- ir = dr * op_block_idx + tails;
41
- }
42
-
43
- input_gm.SetGlobalBuffer((__gm__ float *)input);
44
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
45
- output_gm.SetGlobalBuffer((__gm__ float *)output);
46
-
47
- uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
48
- local_buffer_elems = local_buffer_size / sizeof(float);
49
-
50
- // TODO, consider long row that can't put in UB.
51
- // All data should asign to 32. It's ok because all data is align to 32.
52
- pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
53
- pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
54
- }
55
-
56
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
57
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
58
- const size_t elem_per_block = 32 / sizeof(float);
59
- size_t tail = len % elem_per_block;
60
- len = len & ~(elem_per_block - 1);
61
- if(tail != 0) {
62
- len += elem_per_block;
63
- }
64
- DataCopy(input_local, input_gm[offset], len);
65
- input_queue.EnQue(input_local);
66
- }
67
-
68
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
69
- LocalTensor<float> output_local = output_queue.DeQue<float>();
70
- const size_t elem_per_block = 32 / sizeof(float);
71
- size_t tail = len % elem_per_block;
72
- len = len & ~(elem_per_block - 1);
73
- if (len > 0) {
74
- DataCopy(output_gm[offset], output_local, len);
75
- }
76
-
77
- if(tail != 0) {
78
- #ifdef ASCEND_310P
79
- for (size_t i = tail; i < elem_per_block; i++) {
80
- output_local[len + i].SetValue(0, 0);
81
- }
82
- SetAtomicAdd<float>();
83
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
84
- SetAtomicNone();
85
- #else
86
- DataCopyExtParams dataCopyParams;
87
- dataCopyParams.blockCount = 1;
88
- dataCopyParams.blockLen = tail * sizeof(float);
89
- DataCopyPad(output_gm[offset + len], output_local[len],
90
- dataCopyParams);
91
- #endif
92
- }
93
- output_queue.FreeTensor(output_local);
94
- }
95
-
96
- __aicore__ inline void calculate_row(int64_t idx) {
97
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
98
- const int64_t indices_ne1_idx =
99
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
100
- indices_ne[0];
101
- const int64_t indices_ne0_idx =
102
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
103
- indices_ne1_idx * indices_ne[0]);
104
-
105
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
106
- indices_ne1_idx * indices_stride[1] +
107
- indices_ne2_idx * indices_stride[2];
108
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
109
-
110
- const int64_t input_offset = selected_row_idx * input_stride[1] +
111
- indices_ne1_idx * input_stride[2] +
112
- indices_ne2_idx * input_stride[3];
113
-
114
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
115
- indices_ne1_idx * output_stride[2] +
116
- indices_ne2_idx * output_stride[3];
117
-
118
- copy_in(input_offset, input_ne[0]);
119
- LocalTensor<float> input_local = input_queue.DeQue<float>();
120
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
121
-
122
- DataCopy(output_local, input_local, local_buffer_elems);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset, input_ne[0]);
125
-
126
- input_queue.FreeTensor(input_local);
127
- }
128
-
129
- __aicore__ inline void calculate() {
130
- for (int64_t i = ir; i < ir + dr; i++) {
131
- calculate_row(i);
132
- }
133
- }
134
-
135
- private:
136
- int64_t input_ne[4];
137
- size_t input_stride[4];
138
-
139
- int64_t indices_ne[4];
140
- size_t indices_stride[4];
141
-
142
- int64_t output_ne[4];
143
- size_t output_stride[4];
144
-
145
- size_t local_buffer_elems;
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- TPipe pipe;
151
- GlobalTensor<float> input_gm;
152
- GlobalTensor<int32_t> indices_gm;
153
- GlobalTensor<float> output_gm;
154
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
155
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
156
- int64_t op_block_idx;
157
- };
158
-
159
- template <typename T>
160
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
161
- auto gm_ptr = (__gm__ uint8_t *)gm;
162
- auto ub_ptr = (uint8_t *)(ub);
163
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
164
- *ub_ptr = *gm_ptr;
165
- }
166
- }
167
-
168
- extern "C" __global__ __aicore__ void ascendc_get_row_f32(
169
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
170
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
171
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
172
- int64_t input_ne_ub[4];
173
- size_t input_nb_ub[4];
174
- int64_t indices_ne_ub[4];
175
- size_t indices_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
182
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
183
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
184
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
185
-
186
- GET_ROW_F32 op;
187
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
188
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
189
- op.calculate();
190
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp DELETED
@@ -1,204 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
- #ifdef ASCEND_310P // 310P not support 4bit get row
6
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
7
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
8
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
9
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
10
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
11
- printf("Ascend310P not support 4bit get row.\n");
12
- }
13
- #else
14
-
15
- #define BUFFER_NUM 2
16
-
17
- #define QK4_0 32
18
-
19
- class GET_ROW_Q4_0 {
20
- public:
21
- __aicore__ inline GET_ROW_Q4_0() {}
22
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
23
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
24
- size_t *indices_nb_ub, int64_t *output_ne_ub,
25
- size_t *output_nb_ub) {
26
- int64_t op_block_num = GetBlockNum();
27
- int64_t op_block_idx = GetBlockIdx();
28
-
29
- for (int i = 0; i < 4; i++) {
30
- input_ne[i] = input_ne_ub[i];
31
- indices_ne[i] = indices_ne_ub[i];
32
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
33
- scale_ne[i] = input_ne_ub[i];
34
- output_ne[i] = output_ne_ub[i];
35
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
36
- }
37
-
38
- // one scale for a group.
39
- scale_ne[0] /= QK4_0;
40
-
41
- input_stride[0] = 1;
42
- scale_stride[0] = 1;
43
- output_stride[0] = 1;
44
- for (int i = 1; i < 4; i++) {
45
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
46
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
47
- }
48
-
49
- group_size_in_row = input_ne[0] / QK4_0;
50
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
51
- input_ne[3] / 2;
52
-
53
- // Indices has two dims. n_elements = all rows should get.
54
- // dr, all rows should this thread get.
55
- uint64_t n_elements =
56
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
57
- dr = n_elements / op_block_num;
58
-
59
- uint64_t tails = n_elements % op_block_num;
60
- if (op_block_idx < tails) {
61
- dr += 1;
62
- ir = dr * op_block_idx;
63
- } else {
64
- ir = dr * op_block_idx + tails;
65
- }
66
-
67
- input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
68
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
69
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
70
- output_gm.SetGlobalBuffer((__gm__ float *)output);
71
-
72
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
73
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
74
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
75
- }
76
-
77
- __aicore__ inline void copy_in(uint32_t offset) {
78
- LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
79
- // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
80
- DataCopy(input_local, input_gm[offset], QK4_0);
81
- input_queue.EnQue(input_local);
82
- }
83
-
84
- __aicore__ inline void copy_out(uint32_t offset) {
85
- LocalTensor<float> output_local = output_queue.DeQue<float>();
86
- DataCopy(output_gm[offset], output_local, QK4_0);
87
- output_queue.FreeTensor(output_local);
88
- }
89
-
90
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
91
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
92
- const int64_t indices_ne1_idx =
93
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
94
- indices_ne[0];
95
- const int64_t indices_ne0_idx =
96
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
97
- indices_ne1_idx * indices_ne[0]);
98
-
99
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
100
- indices_ne1_idx * indices_stride[1] +
101
- indices_ne2_idx * indices_stride[2];
102
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
103
-
104
- const int64_t input_offset = selected_row_idx * input_stride[1] +
105
- indices_ne1_idx * input_stride[2] +
106
- indices_ne2_idx * input_stride[3] +
107
- group * QK4_0;
108
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
109
- indices_ne1_idx * scale_stride[2] +
110
- indices_ne2_idx * scale_stride[3] + group;
111
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
112
- indices_ne1_idx * output_stride[2] +
113
- indices_ne2_idx * output_stride[3] +
114
- group * QK4_0;
115
-
116
- copy_in(input_offset);
117
- LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
118
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
119
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
120
-
121
- // TODO: cast more data to speed up.
122
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
123
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
124
-
125
- // Only mul need compile by group.
126
- half scale = scale_gm.GetValue(scale_offset);
127
-
128
- Muls(output_local, output_local, (float)scale, QK4_0);
129
-
130
- input_queue.FreeTensor(input_local);
131
- cast_queue.FreeTensor(cast_local);
132
- output_queue.EnQue(output_local);
133
-
134
- copy_out(output_offset);
135
- }
136
-
137
- __aicore__ inline void calculate() {
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- calculate_group(i, j);
141
- }
142
- }
143
- }
144
-
145
- private:
146
- int64_t input_ne[4];
147
- size_t input_stride[4];
148
-
149
- int64_t scale_ne[4];
150
- size_t scale_stride[4];
151
-
152
- int64_t indices_ne[4];
153
- size_t indices_stride[4];
154
-
155
- int64_t output_ne[4];
156
- size_t output_stride[4];
157
-
158
- int64_t ir;
159
- int64_t dr;
160
-
161
- int64_t group_size_in_row;
162
-
163
- TPipe pipe;
164
- GlobalTensor<int4b_t> input_gm;
165
- GlobalTensor<half> scale_gm;
166
- GlobalTensor<int32_t> indices_gm;
167
- GlobalTensor<float> output_gm;
168
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
169
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
170
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
171
- };
172
-
173
- template <typename T>
174
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
175
- auto gm_ptr = (__gm__ uint8_t *)gm;
176
- auto ub_ptr = (uint8_t *)(ub);
177
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
178
- *ub_ptr = *gm_ptr;
179
- }
180
- }
181
-
182
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
183
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
184
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
185
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
186
- int64_t input_ne_ub[4];
187
- int64_t indices_ne_ub[4];
188
- size_t indices_nb_ub[4];
189
- int64_t output_ne_ub[4];
190
- size_t output_nb_ub[4];
191
-
192
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
193
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
194
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
195
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
196
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
197
-
198
- GET_ROW_Q4_0 op;
199
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
200
- indices_nb_ub, output_ne_ub, output_nb_ub);
201
- op.calculate();
202
- }
203
-
204
- #endif // #ifdef ASCEND_310P
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_q8_0.cpp DELETED
@@ -1,191 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- #define QK8_0 32
9
-
10
- class GET_ROW_Q8_0 {
11
- public:
12
- __aicore__ inline GET_ROW_Q8_0() {}
13
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
14
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
15
- size_t *indices_nb_ub, int64_t *output_ne_ub,
16
- size_t *output_nb_ub) {
17
- int64_t op_block_num = GetBlockNum();
18
- int64_t op_block_idx = GetBlockIdx();
19
-
20
- for (int i = 0; i < 4; i++) {
21
- input_ne[i] = input_ne_ub[i];
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
- scale_ne[i] = input_ne_ub[i];
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // one scale for a group.
30
- scale_ne[0] /= QK8_0;
31
-
32
- input_stride[0] = 1;
33
- scale_stride[0] = 1;
34
- output_stride[0] = 1;
35
- for (int i = 1; i < 4; i++) {
36
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
37
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
38
- }
39
-
40
- group_size_in_row = input_ne[0] / QK8_0;
41
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
42
- input_ne[3] * sizeof(int8_t);
43
-
44
- // Indices has two dims. n_elements = all rows should get.
45
- // dr, all rows should this thread get.
46
- uint64_t n_elements =
47
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
48
- dr = n_elements / op_block_num;
49
-
50
- uint64_t tails = n_elements % op_block_num;
51
- if (op_block_idx < tails) {
52
- dr += 1;
53
- ir = dr * op_block_idx;
54
- } else {
55
- ir = dr * op_block_idx + tails;
56
- }
57
-
58
- input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
59
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
60
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
61
- output_gm.SetGlobalBuffer((__gm__ float *)output);
62
-
63
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
64
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
65
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
66
- }
67
-
68
- __aicore__ inline void copy_in(uint32_t offset) {
69
- LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
70
- DataCopy(input_local, input_gm[offset], QK8_0);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- DataCopy(output_gm[offset], output_local, QK8_0);
77
- output_queue.FreeTensor(output_local);
78
- }
79
-
80
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
81
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
82
- const int64_t indices_ne1_idx =
83
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
84
- indices_ne[0];
85
- const int64_t indices_ne0_idx =
86
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
87
- indices_ne1_idx * indices_ne[0]);
88
-
89
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
90
- indices_ne1_idx * indices_stride[1] +
91
- indices_ne2_idx * indices_stride[2];
92
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
93
-
94
- const int64_t input_offset = selected_row_idx * input_stride[1] +
95
- indices_ne1_idx * input_stride[2] +
96
- indices_ne2_idx * input_stride[3] +
97
- group * QK8_0;
98
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
99
- indices_ne1_idx * scale_stride[2] +
100
- indices_ne2_idx * scale_stride[3] + group;
101
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
102
- indices_ne1_idx * output_stride[2] +
103
- indices_ne2_idx * output_stride[3] +
104
- group * QK8_0;
105
-
106
- copy_in(input_offset);
107
- LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
108
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
109
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
110
-
111
- // TODO: cast more data to speed up.
112
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
113
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
114
-
115
- // Only mul need compile by group.
116
- half scale = scale_gm.GetValue(scale_offset);
117
- Muls(output_local, output_local, (float)scale, QK8_0);
118
-
119
- input_queue.FreeTensor(input_local);
120
- cast_queue.FreeTensor(cast_local);
121
- output_queue.EnQue(output_local);
122
-
123
- copy_out(output_offset);
124
- }
125
-
126
- __aicore__ inline void calculate() {
127
- for (int64_t i = ir; i < ir + dr; i++) {
128
- for (int64_t j = 0; j < group_size_in_row; j++) {
129
- calculate_group(i, j);
130
- }
131
- }
132
- }
133
-
134
- private:
135
- int64_t input_ne[4];
136
- size_t input_stride[4];
137
-
138
- int64_t scale_ne[4];
139
- size_t scale_stride[4];
140
-
141
- int64_t indices_ne[4];
142
- size_t indices_stride[4];
143
-
144
- int64_t output_ne[4];
145
- size_t output_stride[4];
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- int64_t group_size_in_row;
151
-
152
- TPipe pipe;
153
- GlobalTensor<int8_t> input_gm;
154
- GlobalTensor<half> scale_gm;
155
- GlobalTensor<int32_t> indices_gm;
156
- GlobalTensor<float> output_gm;
157
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
158
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
159
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
160
- };
161
-
162
- template <typename T>
163
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
164
- auto gm_ptr = (__gm__ uint8_t *)gm;
165
- auto ub_ptr = (uint8_t *)(ub);
166
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
167
- *ub_ptr = *gm_ptr;
168
- }
169
- }
170
-
171
- extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
172
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
173
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
174
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
175
- int64_t input_ne_ub[4];
176
- int64_t indices_ne_ub[4];
177
- size_t indices_nb_ub[4];
178
- int64_t output_ne_ub[4];
179
- size_t output_nb_ub[4];
180
-
181
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
182
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
183
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
184
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
185
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
186
-
187
- GET_ROW_Q8_0 op;
188
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
189
- indices_nb_ub, output_ne_ub, output_nb_ub);
190
- op.calculate();
191
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp DELETED
@@ -1,218 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
- printf("Ascend310P not support f16->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F16_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F16_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ half *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
63
- group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(scale_queue, 1, 32);
72
- pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<half> input_local = input_queue.DeQue<half>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
108
-
109
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
110
- Abs(abs_local, cast_local, QK8_0);
111
- ReduceMax(max_local, abs_local, work_local, QK8_0);
112
-
113
- pipe_barrier(PIPE_ALL);
114
- float d = max_local.GetValue(0);
115
- d = d / ((1 << 7) - 1);
116
- if (d != 0) {
117
- Muls(cast_local, cast_local, 1.0f / d, QK8_0);
118
- }
119
-
120
- Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
122
- Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset);
125
-
126
- input_queue.FreeTensor(input_local);
127
- work_queue.FreeTensor(work_local);
128
- abs_queue.FreeTensor(abs_local);
129
- max_queue.FreeTensor(max_local);
130
- cast_queue.FreeTensor(cast_local);
131
- return (half)d;
132
- }
133
-
134
- __aicore__ inline void calculate() {
135
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
136
- uint32_t scale_local_offset = 0;
137
- uint32_t scale_global_offset = 0;
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- half scale = calculate_group(i, j);
141
- scale_local.SetValue(scale_local_offset++, scale);
142
- if (scale_local_offset == 16) {
143
- scale_local_offset = 0;
144
- // TODO: OPTIMIZE ME
145
- pipe_barrier(PIPE_ALL);
146
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
147
- pipe_barrier(PIPE_ALL);
148
- scale_global_offset += 16;
149
- }
150
- }
151
- }
152
-
153
- if (scale_local_offset != 0) {
154
- pipe_barrier(PIPE_ALL);
155
- DataCopyExtParams dataCopyParams;
156
- dataCopyParams.blockCount = 1;
157
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
158
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
159
- dataCopyParams);
160
- pipe_barrier(PIPE_ALL);
161
- }
162
- }
163
-
164
- private:
165
- int64_t input_ne[4];
166
- size_t input_stride[4];
167
-
168
- int64_t *scale_ne;
169
- size_t scale_stride[4];
170
-
171
- int64_t output_ne[4];
172
- size_t output_stride[4];
173
-
174
- int64_t group_size_in_row;
175
-
176
- int64_t ir;
177
- int64_t dr;
178
-
179
- TPipe pipe;
180
- GlobalTensor<half> input_gm;
181
- GlobalTensor<half> scale_gm;
182
- GlobalTensor<int8_t> output_gm;
183
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
184
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
185
- TQue<QuePosition::VECIN, 1> work_queue;
186
- TQue<QuePosition::VECOUT, 1> max_queue;
187
- TQue<QuePosition::VECIN, 1> abs_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- TQue<QuePosition::VECOUT, 1> cast_queue;
190
-
191
- };
192
-
193
- template <typename T>
194
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
195
- auto gm_ptr = (__gm__ uint8_t *)gm;
196
- auto ub_ptr = (uint8_t *)(ub);
197
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
198
- *ub_ptr = *gm_ptr;
199
- }
200
- }
201
-
202
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
203
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
204
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
205
- int64_t input_ne_ub[4];
206
- size_t input_nb_ub[4];
207
- int64_t output_ne_ub[4];
208
-
209
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
210
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
211
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
212
-
213
- QUANTIZE_F16_Q8_0 op;
214
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
215
- op.calculate();
216
- }
217
-
218
- #endif // #ifdef ASCEND_310P
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp DELETED
@@ -1,216 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P // 310P not support f32->8bit quantization
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
- printf("Ascend310P not support f32->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F32_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F32_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ float *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
63
- ir * group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
72
- pipe.InitBuffer(scale_queue, 1, 32);
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<float> input_local = input_queue.DeQue<float>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
108
-
109
- Abs(abs_local, input_local, QK8_0);
110
- ReduceMax(max_local, abs_local, work_local, QK8_0);
111
- pipe_barrier(PIPE_ALL);
112
- float d = max_local.GetValue(0);
113
- d = d / ((1 << 7) - 1);
114
- if (d != 0) {
115
- Muls(input_local, input_local, 1.0f / d, QK8_0);
116
- }
117
-
118
- Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
119
- Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
120
- Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- output_queue.EnQue(output_local);
122
- copy_out(output_offset);
123
-
124
- input_queue.FreeTensor(input_local);
125
- work_queue.FreeTensor(work_local);
126
- abs_queue.FreeTensor(abs_local);
127
- max_queue.FreeTensor(max_local);
128
- cast_queue.FreeTensor(cast_local);
129
-
130
- return (half)d;
131
- }
132
-
133
- __aicore__ inline void calculate() {
134
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
135
- uint32_t scale_local_offset = 0;
136
- uint32_t scale_global_offset = 0;
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- for (int64_t j = 0; j < group_size_in_row; j++) {
139
- half scale = calculate_group(i, j);
140
- scale_local.SetValue(scale_local_offset++, scale);
141
- if (scale_local_offset == 16) {
142
- scale_local_offset = 0;
143
- // TODO: OPTIMIZE ME
144
- pipe_barrier(PIPE_ALL);
145
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
146
- pipe_barrier(PIPE_ALL);
147
- scale_global_offset += 16;
148
- }
149
- }
150
- }
151
-
152
- if (scale_local_offset != 0) {
153
- pipe_barrier(PIPE_ALL);
154
- DataCopyExtParams dataCopyParams;
155
- dataCopyParams.blockCount = 1;
156
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
157
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
158
- dataCopyParams);
159
- pipe_barrier(PIPE_ALL);
160
- }
161
- }
162
-
163
- private:
164
- int64_t input_ne[4];
165
- size_t input_stride[4];
166
-
167
- int64_t *scale_ne;
168
- size_t scale_stride[4];
169
-
170
- int64_t output_ne[4];
171
- size_t output_stride[4];
172
-
173
- int64_t group_size_in_row;
174
-
175
- int64_t ir;
176
- int64_t dr;
177
-
178
- TPipe pipe;
179
- GlobalTensor<float> input_gm;
180
- GlobalTensor<half> scale_gm;
181
- GlobalTensor<int8_t> output_gm;
182
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
183
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
184
- TQue<QuePosition::VECIN, 1> work_queue;
185
- TQue<QuePosition::VECOUT, 1> max_queue;
186
- TQue<QuePosition::VECIN, 1> abs_queue;
187
- TQue<QuePosition::VECIN, 1> cast_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- };
190
-
191
- template <typename T>
192
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
193
- auto gm_ptr = (__gm__ uint8_t *)gm;
194
- auto ub_ptr = (uint8_t *)(ub);
195
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
196
- *ub_ptr = *gm_ptr;
197
- }
198
- }
199
-
200
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
201
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
202
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
203
- int64_t input_ne_ub[4];
204
- size_t input_nb_ub[4];
205
- int64_t output_ne_ub[4];
206
-
207
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
208
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
209
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
210
-
211
- QUANTIZE_F32_Q8_0 op;
212
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
213
- op.calculate();
214
- }
215
-
216
- #endif // #ifdef ASCEND_310P
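
For context, the kernel above maps each group of QK8_0 = 32 floats to int8 with one per-group scale: d = max(|x|) / 127 and q_i = round(x_i / d), with the fp16 scales stored behind the int8 data. A rough CPU equivalent of that math (the function name and the float scale array are illustrative only; the kernel writes the scales as half directly into scale_gm):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // CPU sketch of the per-group Q8_0 math used above:
    // d = max(|x|) / 127, q_i = round(x_i / d), one scale per 32 values.
    static void quantize_q8_0_ref(const float *x, int8_t *q, float *scales, size_t n) {
        const size_t qk = 32;  // QK8_0
        for (size_t g = 0; g < n / qk; ++g) {
            float amax = 0.0f;
            for (size_t i = 0; i < qk; ++i) {
                amax = std::fmax(amax, std::fabs(x[g*qk + i]));
            }
            const float d  = amax / 127.0f;                  // (1 << 7) - 1
            const float id = d != 0.0f ? 1.0f / d : 0.0f;
            for (size_t i = 0; i < qk; ++i) {
                q[g*qk + i] = (int8_t) std::lround(x[g*qk + i] * id);
            }
            scales[g] = d;  // the kernel stores this as half behind the int8 data
        }
    }
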
 
ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp DELETED
@@ -1,295 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P // 310P does not support float->4bit quantization
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let the following test cases continue to run; just print an error message here. Of course, any test case that calls this operator will fail.
9
- printf("Ascend310P not support f32->4bit quantization.\n");
10
- }
11
-
12
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
13
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
14
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
15
- // let the following test cases continue to run; just print an error message here. Of course, any test case that calls this operator will fail.
16
- printf("Ascend310P not support f16->4bit quantization.\n");
17
- }
18
- #else
19
-
20
- #define BUFFER_NUM 2
21
- #define Group_Size 32
22
-
23
- template <typename SRC_T>
24
- class QUANTIZE_FLOAT_TO_Q4_0 {
25
- public:
26
- __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
27
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
28
- int64_t *input_ne_ub, size_t *input_nb_ub,
29
- int64_t *output_ne_ub) {
30
- // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
31
- // permute=[0,0,0,0]):
32
- // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
33
- int64_t op_block_num = GetBlockNum();
34
- int64_t op_block_idx = GetBlockIdx();
35
-
36
- // input stride of data elements
37
- for (int i = 0; i < 4; i++) {
38
- input_ne[i] = input_ne_ub[i];
39
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
40
- output_ne[i] = output_ne_ub[i];
41
- }
42
-
43
- // output stride of data elements
44
- output_stride[0] = 1;
45
- for (int i = 1; i < 4; i++) {
46
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
47
- }
48
-
49
- // scales are saved one after another behind the data: [group1_scale, group2_scale, ...]
50
- scale_ne = input_ne;
51
- scale_stride[0] = 1;
52
- scale_stride[1] = input_ne[0] / Group_Size;
53
- for (int i = 2; i < 4; i++) {
54
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
55
- }
56
-
57
- // split input tensor by rows.
58
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
59
- dr = nr / op_block_num;
60
-
61
- uint64_t tails = nr % op_block_num;
62
- if (op_block_idx < tails) {
63
- dr += 1;
64
- ir = dr * op_block_idx;
65
- } else {
66
- ir = dr * op_block_idx + tails;
67
- }
68
-
69
- group_size_in_row = scale_stride[1];
70
- int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
71
- output_ne[3] * sizeof(uint8_t) / 2;
72
-
73
- input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
74
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
75
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
76
- group_size_in_row *
77
- sizeof(half)));
78
-
79
- pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
80
- pipe.InitBuffer(output_queue, BUFFER_NUM,
81
- Group_Size * sizeof(int8_t) / 2);
82
- pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
83
- pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
84
- pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
85
- pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
86
- pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
87
- pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
88
- pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
89
- }
90
-
91
- __aicore__ inline void copy_in(uint32_t offset) {
92
- LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
93
- DataCopy(input_local, input_gm[offset], Group_Size);
94
- input_queue.EnQue(input_local);
95
- }
96
-
97
- __aicore__ inline void copy_out(uint32_t offset) {
98
- // reinterpret-cast Group_Size (32) int4b_t values to Group_Size / 2 int8_t values,
99
- // and use DataCopyPad to avoid the 32-bit alignment restriction.
100
- LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
101
- LocalTensor<int8_t> output_int8_local =
102
- output_local.ReinterpretCast<int8_t>();
103
-
104
- DataCopyExtParams dataCopyParams;
105
- dataCopyParams.blockCount = 1;
106
- dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
107
- DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
108
-
109
- output_queue.FreeTensor(output_local);
110
- }
111
-
112
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
113
- LocalTensor<float> input_local) {
114
- DataCopy(cast_local, input_local, Group_Size);
115
- }
116
-
117
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
118
- LocalTensor<half> input_local) {
119
- Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
120
- }
121
-
122
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
123
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
124
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
125
- const int64_t i1 =
126
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
127
-
128
- const int64_t input_offset = i1 * input_stride[1] +
129
- i2 * input_stride[2] +
130
- i3 * input_stride[3] + Group_Size * group;
131
-
132
- // output_offset indexes output_gm, whose element type is int8_t; dividing
133
- // by 2 is needed because the data is stored as int4b_t.
134
- const int64_t output_offset = (i1 * output_stride[1] +
135
- i2 * output_stride[2] +
136
- i3 * output_stride[3] +
137
- Group_Size * group) / 2;
138
- copy_in(input_offset);
139
-
140
- LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
141
- LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
142
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
143
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
144
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
145
- LocalTensor<float> min_local = min_queue.AllocTensor<float>();
146
- LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
147
- LocalTensor<half> half_local = half_queue.AllocTensor<half>();
148
-
149
- input_to_cast(cast_local, input_local);
150
-
151
- ReduceMax(max_local, cast_local, work_local, Group_Size);
152
- ReduceMin(min_local, cast_local, work_local, Group_Size);
153
- const float max_value = max_local.GetValue(0);
154
- const float min_value = min_local.GetValue(0);
155
- float d = max_value;
156
- if (min_value < 0 && (-1 * min_value) > max_value) {
157
- d = min_value;
158
- }
159
-
160
- d = d / (-8);
161
- if (d != 0) {
162
- Muls(cast_local, cast_local, 1.0f / d, Group_Size);
163
- }
164
-
165
- // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
166
- float scalar = 8.5f;
167
- Adds(cast_local, cast_local, scalar, Group_Size);
168
- Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
169
- scalar = 15.0f;
170
- Mins(cast_local, cast_local, scalar, Group_Size);
171
- scalar = -8.0f;
172
- Adds(cast_local, cast_local, scalar, Group_Size);
173
-
174
- // float->half->int4b
175
- Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
176
- Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
177
-
178
- output_queue.EnQue(output_local);
179
- copy_out(output_offset);
180
-
181
- input_queue.FreeTensor(input_local);
182
- work_queue.FreeTensor(work_local);
183
- max_queue.FreeTensor(max_local);
184
- min_queue.FreeTensor(min_local);
185
- int8_queue.FreeTensor(int8_local);
186
- half_queue.FreeTensor(half_local);
187
- cast_queue.FreeTensor(cast_local);
188
- return (half)d;
189
- }
190
-
191
- __aicore__ inline void calculate() {
192
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
193
- uint32_t scale_local_offset = 0;
194
- uint32_t scale_global_offset = 0;
195
- for (int64_t i = ir; i < ir + dr; i++) {
196
- for (int64_t j = 0; j < group_size_in_row; j++) {
197
- half scale = calculate_group(i, j);
198
- scale_local.SetValue(scale_local_offset++, scale);
199
- // Copy Group_Size/2 scales at a time.
200
- if (scale_local_offset == Group_Size / 2) {
201
- scale_local_offset = 0;
202
- // TODO: OPTIMIZE ME
203
- pipe_barrier(PIPE_ALL);
204
- DataCopy(scale_gm[scale_global_offset], scale_local,
205
- Group_Size / 2);
206
- pipe_barrier(PIPE_ALL);
207
- scale_global_offset += Group_Size / 2;
208
- }
209
- }
210
- }
211
-
212
- if (scale_local_offset != 0) {
213
- pipe_barrier(PIPE_ALL);
214
- DataCopyExtParams dataCopyParams;
215
- dataCopyParams.blockCount = 1;
216
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
217
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
218
- dataCopyParams);
219
- pipe_barrier(PIPE_ALL);
220
- }
221
- scale_queue.FreeTensor(scale_local);
222
- }
223
-
224
- private:
225
- int64_t input_ne[4];
226
- size_t input_stride[4];
227
-
228
- int64_t *scale_ne;
229
- size_t scale_stride[4];
230
-
231
- int64_t output_ne[4];
232
- size_t output_stride[4];
233
-
234
- int64_t group_size_in_row;
235
-
236
- int64_t ir;
237
- int64_t dr;
238
-
239
- TPipe pipe;
240
- GlobalTensor<SRC_T> input_gm;
241
- GlobalTensor<half> scale_gm;
242
- GlobalTensor<int8_t> output_gm;
243
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
244
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
245
- TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
246
- TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
247
- TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
248
- TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
249
- TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
250
- TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
251
- TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
252
- };
253
-
254
- template <typename T>
255
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
256
- auto gm_ptr = (__gm__ uint8_t *)gm;
257
- auto ub_ptr = (uint8_t *)(ub);
258
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
259
- *ub_ptr = *gm_ptr;
260
- }
261
- }
262
-
263
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
264
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
265
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
266
- int64_t input_ne_ub[4];
267
- size_t input_nb_ub[4];
268
- int64_t output_ne_ub[4];
269
-
270
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
271
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
272
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
273
-
274
- QUANTIZE_FLOAT_TO_Q4_0<half> op;
275
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
276
- op.calculate();
277
- }
278
-
279
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
280
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
281
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
282
- int64_t input_ne_ub[4];
283
- size_t input_nb_ub[4];
284
- int64_t output_ne_ub[4];
285
-
286
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
287
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
288
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
289
-
290
- QUANTIZE_FLOAT_TO_Q4_0<float> op;
291
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
292
- op.calculate();
293
- }
294
-
295
- #endif // #ifdef ASCEND_310P
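
For context, the kernel above picks the signed extreme with the largest magnitude in each group of 32 values, sets d = extreme / -8, and maps x/d from [-8, 8] onto the 4-bit range [-8, 7] exactly as the range comment describes. A rough CPU equivalent (illustrative names; it also skips the nibble packing that the kernel does through int4b_t and ReinterpretCast):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // CPU sketch of the per-group Q4_0 mapping used above; writes one int8 in
    // [-8, 7] per input value instead of packing two values per byte.
    static float quantize_group_q4_0_ref(const float *x, int8_t *q) {
        const int gs = 32;  // Group_Size
        float vmax = x[0], vmin = x[0];
        for (int i = 1; i < gs; ++i) {
            vmax = std::max(vmax, x[i]);
            vmin = std::min(vmin, x[i]);
        }
        float d = (vmin < 0.0f && -vmin > vmax) ? vmin : vmax;  // signed extreme
        d /= -8.0f;
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int i = 0; i < gs; ++i) {
            // [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
            float v = std::floor(x[i] * id + 8.5f);
            q[i] = (int8_t) (std::min(v, 15.0f) - 8.0f);
        }
        return d;  // the kernel stores this scale as half after the packed data
    }
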
 
ggml/src/ggml-kompute/CMakeLists.txt DELETED
@@ -1,166 +0,0 @@
1
-
2
- find_package(Vulkan COMPONENTS glslc REQUIRED)
3
- find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
4
-
5
- if (NOT glslc_executable)
6
- message(FATAL_ERROR "glslc not found")
7
- endif()
8
-
9
- ggml_add_backend_library(ggml-kompute
10
- ggml-kompute.cpp
11
- ../../include/ggml-kompute.h
12
- )
13
-
14
- target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
15
- target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
16
-
17
- add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
18
-
19
- function(compile_shader)
20
- set(options)
21
- set(oneValueArgs)
22
- set(multiValueArgs SOURCES)
23
- cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
24
- foreach(source ${compile_shader_SOURCES})
25
- get_filename_component(filename ${source} NAME)
26
- set(spv_file ${filename}.spv)
27
- add_custom_command(
28
- OUTPUT ${spv_file}
29
- DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
30
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
31
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
32
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
33
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
34
- COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
35
- COMMENT "Compiling ${source} to ${spv_file}"
36
- )
37
-
38
- get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
39
- set(FILE_NAME "shader${RAW_FILE_NAME}")
40
- string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
41
- string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
42
- string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
43
- set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
44
- message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
45
- if(CMAKE_GENERATOR MATCHES "Visual Studio")
46
- add_custom_command(
47
- OUTPUT ${OUTPUT_HEADER_FILE}
48
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
49
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
50
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
51
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
52
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
53
- COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
54
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
55
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
56
- DEPENDS ${spv_file} xxd
57
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
58
- )
59
- else()
60
- add_custom_command(
61
- OUTPUT ${OUTPUT_HEADER_FILE}
62
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
63
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
64
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
65
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
66
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
67
- COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
68
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
69
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
70
- DEPENDS ${spv_file} xxd
71
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
72
- )
73
- endif()
74
- endforeach()
75
- endfunction()
76
-
77
- if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
78
- message(STATUS "Kompute found")
79
- set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
80
- add_subdirectory(kompute)
81
-
82
- # Compile our shaders
83
- compile_shader(SOURCES
84
- kompute-shaders/op_scale.comp
85
- kompute-shaders/op_scale_8.comp
86
- kompute-shaders/op_add.comp
87
- kompute-shaders/op_addrow.comp
88
- kompute-shaders/op_mul.comp
89
- kompute-shaders/op_silu.comp
90
- kompute-shaders/op_relu.comp
91
- kompute-shaders/op_gelu.comp
92
- kompute-shaders/op_softmax.comp
93
- kompute-shaders/op_norm.comp
94
- kompute-shaders/op_rmsnorm.comp
95
- kompute-shaders/op_diagmask.comp
96
- kompute-shaders/op_mul_mat_mat_f32.comp
97
- kompute-shaders/op_mul_mat_f16.comp
98
- kompute-shaders/op_mul_mat_q8_0.comp
99
- kompute-shaders/op_mul_mat_q4_0.comp
100
- kompute-shaders/op_mul_mat_q4_1.comp
101
- kompute-shaders/op_mul_mat_q4_k.comp
102
- kompute-shaders/op_mul_mat_q6_k.comp
103
- kompute-shaders/op_getrows_f32.comp
104
- kompute-shaders/op_getrows_f16.comp
105
- kompute-shaders/op_getrows_q4_0.comp
106
- kompute-shaders/op_getrows_q4_1.comp
107
- kompute-shaders/op_getrows_q6_k.comp
108
- kompute-shaders/op_rope_norm_f16.comp
109
- kompute-shaders/op_rope_norm_f32.comp
110
- kompute-shaders/op_rope_neox_f16.comp
111
- kompute-shaders/op_rope_neox_f32.comp
112
- kompute-shaders/op_cpy_f16_f16.comp
113
- kompute-shaders/op_cpy_f16_f32.comp
114
- kompute-shaders/op_cpy_f32_f16.comp
115
- kompute-shaders/op_cpy_f32_f32.comp
116
- )
117
-
118
- # Create a custom target for our generated shaders
119
- add_custom_target(generated_shaders DEPENDS
120
- shaderop_scale.h
121
- shaderop_scale_8.h
122
- shaderop_add.h
123
- shaderop_addrow.h
124
- shaderop_mul.h
125
- shaderop_silu.h
126
- shaderop_relu.h
127
- shaderop_gelu.h
128
- shaderop_softmax.h
129
- shaderop_norm.h
130
- shaderop_rmsnorm.h
131
- shaderop_diagmask.h
132
- shaderop_mul_mat_mat_f32.h
133
- shaderop_mul_mat_f16.h
134
- shaderop_mul_mat_q8_0.h
135
- shaderop_mul_mat_q4_0.h
136
- shaderop_mul_mat_q4_1.h
137
- shaderop_mul_mat_q4_k.h
138
- shaderop_mul_mat_q6_k.h
139
- shaderop_getrows_f32.h
140
- shaderop_getrows_f16.h
141
- shaderop_getrows_q4_0.h
142
- shaderop_getrows_q4_1.h
143
- shaderop_getrows_q6_k.h
144
- shaderop_rope_norm_f16.h
145
- shaderop_rope_norm_f32.h
146
- shaderop_rope_neox_f16.h
147
- shaderop_rope_neox_f32.h
148
- shaderop_cpy_f16_f16.h
149
- shaderop_cpy_f16_f32.h
150
- shaderop_cpy_f32_f16.h
151
- shaderop_cpy_f32_f32.h
152
- )
153
-
154
- # Create a custom command that depends on the generated_shaders
155
- add_custom_command(
156
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
157
- COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
158
- DEPENDS generated_shaders
159
- COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
160
- )
161
-
162
- # Add the stamp to the main sources to ensure dependency tracking
163
- target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
164
- else()
165
- message(WARNING "Kompute not found")
166
- endif()
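
To make the echo/xxd pipeline above more concrete: each compiled .comp shader is embedded as a byte array inside a generated header under the kp::shader_data namespace, which ggml-kompute.cpp then includes. Roughly, the generated shaderop_add.h would look like the sketch below (the byte values and length are placeholders; the real contents are whatever glslc and xxd produce at build time):

    /*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/
    #ifndef SHADEROP_ADD_H
    #define SHADEROP_ADD_H
    namespace kp {
    namespace shader_data {
    // xxd -i names the array after the input file: op_add.comp.spv -> op_add_comp_spv
    unsigned char op_add_comp_spv[] = { 0x03, 0x02, 0x23, 0x07 /* , ... SPIR-V bytes ... */ };
    unsigned int op_add_comp_spv_len = 4;  // xxd emits the real byte count here
    }}
    #endif // define SHADEROP_ADD_H
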
 
ggml/src/ggml-kompute/ggml-kompute.cpp DELETED
@@ -1,2251 +0,0 @@
1
- #include "ggml-impl.h"
2
- #include "ggml-backend.h"
3
- #include "ggml-backend-impl.h"
4
- #include "ggml-kompute.h"
5
-
6
- // These are generated at build time by a cmake custom command
7
- #include "shaderop_scale.h"
8
- #include "shaderop_scale_8.h"
9
- #include "shaderop_add.h"
10
- #include "shaderop_addrow.h"
11
- #include "shaderop_mul.h"
12
- #include "shaderop_silu.h"
13
- #include "shaderop_relu.h"
14
- #include "shaderop_gelu.h"
15
- #include "shaderop_softmax.h"
16
- #include "shaderop_norm.h"
17
- #include "shaderop_rmsnorm.h"
18
- #include "shaderop_diagmask.h"
19
- #include "shaderop_mul_mat_f16.h"
20
- #include "shaderop_mul_mat_q8_0.h"
21
- #include "shaderop_mul_mat_q4_0.h"
22
- #include "shaderop_mul_mat_q4_1.h"
23
- #include "shaderop_mul_mat_q4_k.h"
24
- #include "shaderop_mul_mat_q6_k.h"
25
- #include "shaderop_mul_mat_mat_f32.h"
26
- #include "shaderop_getrows_f32.h"
27
- #include "shaderop_getrows_f16.h"
28
- #include "shaderop_getrows_q4_0.h"
29
- #include "shaderop_getrows_q4_1.h"
30
- #include "shaderop_getrows_q6_k.h"
31
- #include "shaderop_rope_norm_f16.h"
32
- #include "shaderop_rope_norm_f32.h"
33
- #include "shaderop_rope_neox_f16.h"
34
- #include "shaderop_rope_neox_f32.h"
35
- #include "shaderop_cpy_f16_f16.h"
36
- #include "shaderop_cpy_f16_f32.h"
37
- #include "shaderop_cpy_f32_f16.h"
38
- #include "shaderop_cpy_f32_f32.h"
39
-
40
- #include <algorithm>
41
- #include <array>
42
- #include <cassert>
43
- #include <cstdint>
44
- #include <cstdio>
45
- #include <cstring>
46
- #include <iostream>
47
- #include <memory>
48
- #include <mutex>
49
- #include <stdexcept>
50
- #include <string>
51
- #include <unordered_map>
52
- #include <utility>
53
- #include <vector>
54
-
55
- #include <kompute/Kompute.hpp>
56
- #include <vulkan/vulkan.hpp>
57
-
58
- #ifdef __linux__
59
- #include <cstdlib> // for setenv
60
- #endif
61
-
62
- #define QK4_0 32
63
- #define QR4_0 2
64
- #define QK4_1 32
65
- #define QK_NL 16
66
-
67
- typedef ggml_fp16_t half;
68
-
69
- static std::string ggml_kompute_format_name(int device) {
70
- return "Kompute" + std::to_string(device);
71
- }
72
-
73
- struct ggml_kompute_context {
74
- int device;
75
- std::string name;
76
- std::shared_ptr<vk::DescriptorPool> pool;
77
-
78
- ggml_kompute_context(int device)
79
- : device(device), name(ggml_kompute_format_name(device)) {}
80
- };
81
-
82
- // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
83
- // and consolidate the init functions and simplify object lifetime management. As it currently stands,
84
- // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
85
- // is only created when a device is set and vulkan is explicitly turned on.
86
- static ggml_kompute_context *s_kompute_context = nullptr;
87
-
88
- class kompute_manager {
89
- kp::Manager *s_mgr = nullptr;
90
-
91
- public:
92
- kp::Manager *operator()() {
93
- if (s_mgr && !s_mgr->hasInstance()) {
94
- destroy();
95
- }
96
- if (!s_mgr) {
97
- s_mgr = new kp::Manager;
98
- }
99
- return s_mgr;
100
- }
101
-
102
- void destroy() {
103
- delete s_mgr;
104
- s_mgr = nullptr;
105
- }
106
- };
107
-
108
- static kompute_manager komputeManager;
109
-
110
- struct ggml_vk_memory {
111
- void *data = nullptr;
112
- size_t size = 0;
113
- vk::DeviceMemory *primaryMemory = nullptr;
114
- vk::Buffer *primaryBuffer = nullptr;
115
- vk::DeviceMemory *stagingMemory = nullptr;
116
- vk::Buffer *stagingBuffer = nullptr;
117
- };
118
-
119
- #ifdef __linux__
120
- __attribute__((constructor))
121
- static void enable_sam() {
122
- setenv("RADV_PERFTEST", "sam", false);
123
- }
124
- #endif
125
-
126
- static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
127
- vk::PhysicalDeviceFeatures availableFeatures;
128
- physical_device.getFeatures(&availableFeatures);
129
-
130
- if (!availableFeatures.shaderInt16)
131
- return false;
132
-
133
- vk::PhysicalDeviceVulkan11Features availableFeatures11;
134
- vk::PhysicalDeviceVulkan12Features availableFeatures12;
135
-
136
- availableFeatures11.pNext = &availableFeatures12;
137
- availableFeatures12.pNext = nullptr;
138
-
139
- vk::PhysicalDeviceFeatures2 features2;
140
- features2.pNext = &availableFeatures11;
141
-
142
- physical_device.getFeatures2(&features2);
143
-
144
- if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
145
- !availableFeatures11.storageBuffer16BitAccess) {
146
- return false;
147
- }
148
-
149
- if (!availableFeatures12.storageBuffer8BitAccess ||
150
- !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
151
- !availableFeatures12.shaderFloat16 ||
152
- !availableFeatures12.shaderInt8) {
153
- return false;
154
- }
155
-
156
- return true;
157
- }
158
-
159
- static const char * ggml_vk_getVendorName(uint32_t vendorID) {
160
- switch (vendorID) {
161
- case 0x10DE:
162
- return "nvidia";
163
- case 0x1002:
164
- return "amd";
165
- case 0x8086:
166
- return "intel";
167
- default:
168
- return "unknown";
169
- }
170
- }
171
-
172
- static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
173
- std::vector<ggml_vk_device> results;
174
- if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
175
- return results;
176
-
177
- std::vector<vk::PhysicalDevice> physical_devices;
178
- try {
179
- physical_devices = komputeManager()->listDevices();
180
- } catch (vk::SystemError & err) {
181
- std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
182
- return results;
183
- }
184
-
185
- uint32_t deviceCount = physical_devices.size();
186
- if (deviceCount == 0)
187
- return results;
188
-
189
- std::unordered_map<std::string, size_t> count_by_name;
190
-
191
- for (uint32_t i = 0; i < deviceCount; i++) {
192
- const auto & physical_device = physical_devices[i];
193
-
194
- VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
195
- VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
196
- const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
197
- const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
198
- if (major < 1 || minor < 2)
199
- continue;
200
-
201
- if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
202
- continue;
203
-
204
- size_t heapSize = 0;
205
- for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
206
- VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
207
- if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
208
- heapSize = heap.size;
209
- break;
210
- }
211
- }
212
-
213
- if (heapSize < memoryRequired)
214
- continue;
215
-
216
- auto ext_props = physical_device.enumerateDeviceExtensionProperties();
217
- bool has_maintenance4 = false;
218
-
219
- // Check if maintenance4 is supported
220
- for (const auto & properties : ext_props) {
221
- if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
222
- has_maintenance4 = true;
223
- }
224
- }
225
-
226
- vk::PhysicalDeviceSubgroupProperties subgroup_props;
227
- vk::PhysicalDeviceProperties2 dev_props2;
228
- vk::PhysicalDeviceMaintenance3Properties dev_props3;
229
- vk::PhysicalDeviceMaintenance4Properties dev_props4;
230
- dev_props2.pNext = &dev_props3;
231
- dev_props3.pNext = &subgroup_props;
232
- if (has_maintenance4) {
233
- subgroup_props.pNext = &dev_props4;
234
- }
235
- physical_device.getProperties2(&dev_props2);
236
-
237
- if (subgroup_props.subgroupSize < 32)
238
- continue;
239
-
240
- ggml_vk_device d;
241
- d.index = i;
242
- d.type = dev_props.deviceType;
243
- d.heapSize = heapSize;
244
- d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
245
- d.subgroupSize = subgroup_props.subgroupSize;
246
- d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;
247
-
248
- if (has_maintenance4) {
249
- d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
250
- } else {
251
- d.maxAlloc = dev_props3.maxMemoryAllocationSize;
252
- }
253
-
254
- std::string name(dev_props.deviceName);
255
- size_t n_idx = ++count_by_name[name];
256
- if (n_idx > 1) {
257
- name += " (" + std::to_string(n_idx) + ")";
258
- }
259
- d.name = strdup(name.c_str());
260
-
261
- results.push_back(d);
262
- }
263
-
264
- std::stable_sort(results.begin(), results.end(),
265
- [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
266
- if (lhs.type != rhs.type) {
267
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
268
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
269
-
270
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
271
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
272
- }
273
- return lhs.heapSize < rhs.heapSize;
274
- }
275
- );
276
-
277
- return results;
278
- }
279
-
280
- static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
281
- static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
282
- return devices;
283
- }
284
-
285
- static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
286
- devices.erase(
287
- std::remove_if(devices.begin(), devices.end(),
288
- [&targetVendor](const ggml_vk_device& device) {
289
- return device.vendor != targetVendor;
290
- }),
291
- devices.end()
292
- );
293
- }
294
-
295
- static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
296
- devices.erase(
297
- std::remove_if(devices.begin(), devices.end(),
298
- [&targetName](const ggml_vk_device& device) {
299
- return device.name != targetName;
300
- }),
301
- devices.end()
302
- );
303
- }
304
-
305
- static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
306
- if (name.empty())
307
- return false;
308
-
309
- auto devices = ggml_vk_available_devices_internal(memoryRequired);
310
- if (name == "amd" || name == "nvidia" || name == "intel") {
311
- ggml_vk_filterByVendor(devices, name);
312
- } else if (name != "gpu") {
313
- ggml_vk_filterByName(devices, name);
314
- }
315
-
316
- if (devices.empty())
317
- return false;
318
-
319
- *device = devices.front();
320
- return true;
321
- }
322
-
323
- bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
324
- return ggml_vk_get_device(device, memoryRequired, std::string(name));
325
- }
326
-
327
- bool ggml_vk_has_vulkan() {
328
- return komputeManager()->hasVulkan();
329
- }
330
-
331
- bool ggml_vk_has_device() {
332
- return komputeManager()->hasDevice();
333
- }
334
-
335
- ggml_vk_device ggml_vk_current_device() {
336
- if (!komputeManager()->hasDevice())
337
- return ggml_vk_device();
338
-
339
- auto devices = ggml_vk_available_devices();
340
- ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
341
- GGML_ASSERT(!devices.empty());
342
- return devices.front();
343
- }
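
For reference, the (now removed) public entry points above could be used by an application roughly as follows; this is only a sketch against the deleted ggml-kompute.h API, with the memory requirement and the "gpu" selector chosen arbitrarily:

    #include <cstdio>
    #include "ggml-kompute.h"  // header removed by this commit; shown for illustration

    int main() {
        if (!ggml_vk_has_vulkan()) {
            std::fprintf(stderr, "no Vulkan support available\n");
            return 1;
        }
        ggml_vk_device dev;
        // "amd"/"nvidia"/"intel" filter by vendor, "gpu" accepts any device,
        // anything else is matched against the device name.
        if (!ggml_vk_get_device(&dev, /*memoryRequired =*/ 0, "gpu")) {
            std::fprintf(stderr, "no suitable Vulkan device found\n");
            return 1;
        }
        std::printf("selected %s (%s), subgroup size %u\n", dev.name, dev.vendor, dev.subgroupSize);
        return 0;
    }
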
344
-
345
- static
346
- void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
347
- std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
348
- vk::DescriptorPoolSize(
349
- vk::DescriptorType::eStorageBuffer,
350
- 4 * size // Descriptor count is number of possible tensors to pass into an algorithm
351
- )
352
- };
353
-
354
- vk::DescriptorPoolCreateInfo descriptorPoolInfo(
355
- vk::DescriptorPoolCreateFlags(),
356
- size, // Max sets
357
- static_cast<uint32_t>(descriptorPoolSizes.size()),
358
- descriptorPoolSizes.data());
359
-
360
- ctx->pool = std::make_shared<vk::DescriptorPool>();
361
- vk::Result r = komputeManager()->device()->createDescriptorPool(
362
- &descriptorPoolInfo, nullptr, ctx->pool.get());
363
- if (r != vk::Result::eSuccess)
364
- std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
365
- }
366
-
367
- static
368
- void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
369
- if (ctx->pool) {
370
- komputeManager()->device()->destroy(
371
- *ctx->pool,
372
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
373
- ctx->pool = nullptr;
374
- }
375
- }
376
-
377
- static
378
- vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
379
- vk::BufferCreateInfo bufferCreateInfo;
380
- bufferCreateInfo.size = size;
381
- bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
382
- vk::BufferUsageFlagBits::eTransferSrc |
383
- vk::BufferUsageFlagBits::eTransferDst;
384
- bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
385
-
386
- vk::Buffer *vkBuffer = new vk::Buffer;
387
- vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
388
- if (r != vk::Result::eSuccess)
389
- std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
390
- return vkBuffer;
391
- }
392
-
393
- static
394
- vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
395
-
396
- uint32_t memoryTypeIndex = -1;
397
- bool memoryTypeIndexFound = false;
398
- vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
399
- for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
400
- const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
401
- const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
402
- if (memoryHeap.size < size) {
403
- continue;
404
- }
405
-
406
- if (requirements.memoryTypeBits & (1 << i)) {
407
- if (((memoryProperties.memoryTypes[i]).propertyFlags &
408
- flags) == flags) {
409
- memoryTypeIndex = i;
410
- memoryTypeIndexFound = true;
411
- if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
412
- *isHostVisible = true;
413
- }
414
- break;
415
- }
416
- }
417
- }
418
- if (!memoryTypeIndexFound) {
419
- throw std::runtime_error(
420
- "Memory type index for buffer creation not found");
421
- }
422
-
423
- vk::MemoryAllocateInfo allocInfo;
424
- allocInfo.allocationSize = size;
425
- allocInfo.memoryTypeIndex = memoryTypeIndex;
426
- vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory;
427
- vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
428
- if (r != vk::Result::eSuccess) {
429
- std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
430
- throw std::runtime_error("Error allocating vulkan memory.");
431
- }
432
- return vkDeviceMemory;
433
- }
434
-
435
- static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
436
- size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
437
-
438
- // If offset is already aligned, return it directly
439
- if (offset % minStorageBufferOffsetAlignment == 0) {
440
- return offset;
441
- }
442
-
443
- // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
444
- return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
445
- }
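
The rounding above is the usual round-down-to-alignment trick; a self-contained restatement with the alignment passed in explicitly (the helper name and the 256-byte alignment are just illustrative values):

    #include <cassert>
    #include <cstddef>

    // Same arithmetic as ggml_vk_aligned_offset(), with the alignment as a parameter.
    static size_t aligned_offset(size_t offset, size_t alignment) {
        if (offset % alignment == 0) {
            return offset;  // already aligned
        }
        return (offset / alignment) * alignment;  // round down to a multiple
    }

    int main() {
        assert(aligned_offset(512, 256) == 512);  // unchanged when already aligned
        assert(aligned_offset(300, 256) == 256);  // the remaining 44 bytes become an extra in-buffer offset
        assert(aligned_offset(100, 256) == 0);
        return 0;
    }
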
446
-
447
- static ggml_vk_memory ggml_vk_allocate(size_t size) {
448
- ggml_vk_memory memory;
449
- bool isHostVisible = false;
450
- {
451
- memory.primaryBuffer = ggml_vk_allocate_buffer(size);
452
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
453
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
454
- memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
455
- komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
456
- if (isHostVisible) {
457
- vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
458
- if (r != vk::Result::eSuccess)
459
- std::cerr << "Error mapping memory" << vk::to_string(r);
460
- }
461
- }
462
-
463
- if (!isHostVisible) {
464
- memory.stagingBuffer = ggml_vk_allocate_buffer(size);
465
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
466
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
467
- vk::MemoryPropertyFlagBits::eHostCoherent |
468
- vk::MemoryPropertyFlagBits::eHostCached;
469
- memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
470
- komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
471
- vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
472
- if (r != vk::Result::eSuccess)
473
- std::cerr << "Error mapping memory" << vk::to_string(r);
474
- }
475
-
476
- memory.size = size;
477
- return memory;
478
- }
479
-
480
- static void ggml_vk_free_memory(ggml_vk_memory &memory)
481
- {
482
- komputeManager()->device()->destroy(
483
- *memory.primaryBuffer,
484
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
485
- if (memory.stagingBuffer) {
486
- komputeManager()->device()->destroy(
487
- *memory.stagingBuffer,
488
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
489
- }
490
- komputeManager()->device()->freeMemory(
491
- *memory.primaryMemory,
492
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
493
- if (memory.stagingMemory) {
494
- komputeManager()->device()->freeMemory(
495
- *memory.stagingMemory,
496
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
497
- }
498
- }
499
-
500
- static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
501
-
502
- static
503
- ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
504
- ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
505
-
506
- // compatibility with ggml-backend
507
- GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
508
-
509
- ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
510
-
511
- const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
512
-
513
- GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
514
-
515
- offset = uint64_t(ioffs);
516
- return buf_ctx;
517
- }
518
-
519
- static
520
- const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
521
- uint64_t originalOffset = 0;
522
- auto * res = ggml_vk_find_tensor(t, originalOffset);
523
- if (!res) {
524
- static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
525
- return nullTensor;
526
- }
527
-
528
- // Create a tensor whose memory will be composed of our buffers at the correct offset
529
- const size_t nelements = ggml_nelements(t);
530
- size_t nbytes = ggml_nbytes(t);
531
-
532
- size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
533
- if (alignedOffset) {
534
- *alignedOffset = originalOffset - vulkanOffset;
535
- nbytes += *alignedOffset;
536
- }
537
-
538
- return komputeManager()->tensor(
539
- t->data,
540
- nelements,
541
- nbytes, kp::Tensor::TensorDataTypes::eFloat,
542
- res->primaryMemory, res->primaryBuffer,
543
- res->stagingMemory, res->stagingBuffer,
544
- vulkanOffset);
545
- }
546
-
547
- static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
548
- if (size % sizeof(uint32_t) != 0) {
549
- throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
550
- }
551
-
552
- const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
553
- size_t count = size / sizeof(uint32_t);
554
- return std::vector<uint32_t>(data_ptr, data_ptr + count);
555
- }
556
-
557
- inline static
558
- uint32_t safe_divide(uint32_t a, uint32_t b) {
559
- if (b <= 1) {
560
- return a;
561
- }
562
- if ((a % b) != 0) {
563
- fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
564
- GGML_ABORT("safe_divide result would've had remainder");
565
- }
566
- return a / b;
567
- }
568
-
569
- static void ggml_vk_add(
570
- kp::Sequence& seq,
571
- const std::shared_ptr<kp::Tensor>& inA,
572
- const std::shared_ptr<kp::Tensor>& inB,
573
- const std::shared_ptr<kp::Tensor>& out,
574
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
575
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
576
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
577
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
578
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
579
- int32_t ne0,
580
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
581
- ) {
582
- const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
583
- kp::shader_data::op_add_comp_spv_len);
584
-
585
- struct PushConstants {
586
- uint32_t inAOff, inBOff, outOff;
587
- int32_t ne00;
588
- int32_t nb00, nb01, nb02, nb03;
589
- int32_t ne10, ne11, ne12, ne13;
590
- int32_t nb10, nb11, nb12, nb13;
591
- int32_t ne0;
592
- int32_t nb0, nb1, nb2, nb3;
593
- } const pushConsts {
594
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
595
- ne00,
596
- nb00, nb01, nb02, nb03,
597
- ne10, ne11, ne12, ne13,
598
- nb10, nb11, nb12, nb13,
599
- ne0,
600
- nb0, nb1, nb2, nb3
601
- };
602
-
603
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
604
- if (!komputeManager()->hasAlgorithm(__func__)) {
605
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
606
- } else {
607
- s_algo = komputeManager()->getAlgorithm(__func__);
608
- s_algo->setTensors({inA, inB, out});
609
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
610
- s_algo->setPushConstants<PushConstants>({pushConsts});
611
- s_algo->updateDescriptors(s_kompute_context->pool.get());
612
- }
613
- seq.record<kp::OpAlgoDispatch>(s_algo);
614
- }
615
-
616
- static void ggml_vk_addrow(kp::Sequence& seq,
617
- const std::shared_ptr<kp::Tensor>& inA,
618
- const std::shared_ptr<kp::Tensor>& inB,
619
- const std::shared_ptr<kp::Tensor>& out,
620
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
621
- uint32_t size, uint32_t row = 0) {
622
-
623
- const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
624
- kp::shader_data::op_addrow_comp_spv_len);
625
-
626
- struct PushConstants {
627
- uint32_t inAOff, inBOff, outOff;
628
- uint32_t row;
629
- } const pushConsts {
630
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
631
- row
632
- };
633
-
634
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
635
- if (!komputeManager()->hasAlgorithm(__func__))
636
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
637
- else {
638
- s_algo = komputeManager()->getAlgorithm(__func__);
639
- s_algo->setTensors({inA, inB, out});
640
- s_algo->setWorkgroup({size});
641
- s_algo->setPushConstants<PushConstants>({pushConsts});
642
- s_algo->updateDescriptors(s_kompute_context->pool.get());
643
- }
644
- seq.record<kp::OpAlgoDispatch>(s_algo);
645
- }
646
-
647
- static void ggml_vk_mul(
648
- kp::Sequence& seq,
649
- const std::shared_ptr<kp::Tensor>& inA,
650
- const std::shared_ptr<kp::Tensor>& inB,
651
- const std::shared_ptr<kp::Tensor>& out,
652
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
653
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
654
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
655
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
656
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
657
- int32_t ne0,
658
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
659
- ) {
660
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
661
- kp::shader_data::op_mul_comp_spv_len);
662
-
663
- struct PushConstants {
664
- uint32_t inAOff, inBOff, outOff;
665
- int32_t ne00;
666
- int32_t nb00, nb01, nb02, nb03;
667
- int32_t ne10, ne11, ne12, ne13;
668
- int32_t nb10, nb11, nb12, nb13;
669
- int32_t ne0;
670
- int32_t nb0, nb1, nb2, nb3;
671
- } const pushConsts {
672
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
673
- ne00,
674
- nb00, nb01, nb02, nb03,
675
- ne10, ne11, ne12, ne13,
676
- nb10, nb11, nb12, nb13,
677
- ne0,
678
- nb0, nb1, nb2, nb3
679
- };
680
-
681
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
682
- if (!komputeManager()->hasAlgorithm(__func__)) {
683
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
684
- } else {
685
- s_algo = komputeManager()->getAlgorithm(__func__);
686
- s_algo->setTensors({inA, inB, out});
687
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
688
- s_algo->setPushConstants<PushConstants>({pushConsts});
689
- s_algo->updateDescriptors(s_kompute_context->pool.get());
690
- }
691
- seq.record<kp::OpAlgoDispatch>(s_algo);
692
- }
693
-
694
- static void ggml_vk_scale(kp::Sequence& seq,
695
- const std::shared_ptr<kp::Tensor>& in,
696
- const std::shared_ptr<kp::Tensor>& out,
697
- uint32_t inOff, uint32_t outOff,
698
- uint32_t size, float scale) {
699
- const static auto spirv_1 = getSpirvShader(
700
- kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
701
- );
702
- const static auto spirv_8 = getSpirvShader(
703
- kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
704
- );
705
-
706
- struct PushConstants {
707
- uint32_t inOff, outOff;
708
- float scale;
709
- } const pushConsts {
710
- safe_divide(inOff, 4), safe_divide(outOff, 4),
711
- scale
712
- };
713
-
714
- const auto * spirv = &spirv_1;
715
- std::string name(__func__);
716
- if (size % 8 == 0) {
717
- size /= 8;
718
- name += "_8";
719
- spirv = &spirv_8;
720
- }
721
-
722
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
723
- if (!komputeManager()->hasAlgorithm(name)) {
724
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
725
- } else {
726
- s_algo = komputeManager()->getAlgorithm(name);
727
- s_algo->setTensors({in, out});
728
- s_algo->setWorkgroup({size});
729
- s_algo->setPushConstants<PushConstants>({pushConsts});
730
- s_algo->updateDescriptors(s_kompute_context->pool.get());
731
- }
732
- seq.record<kp::OpAlgoDispatch>(s_algo);
733
- }
734
-
735
- static void ggml_vk_xxlu(
736
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
737
- const std::shared_ptr<kp::Tensor>& in,
738
- const std::shared_ptr<kp::Tensor>& out,
739
- uint32_t inOff, uint32_t outOff,
740
- uint32_t size
741
- ) {
742
- struct PushConstants {
743
- uint32_t inOff, outOff;
744
- } const pushConsts {
745
- safe_divide(inOff, 4), safe_divide(outOff, 4),
746
- };
747
-
748
- auto name = std::string(__func__) + "_" + suffix;
749
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
750
- if (!komputeManager()->hasAlgorithm(name)) {
751
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
752
- } else {
753
- s_algo = komputeManager()->getAlgorithm(name);
754
- s_algo->setTensors({in, out});
755
- s_algo->setWorkgroup({size});
756
- s_algo->setPushConstants<PushConstants>({pushConsts});
757
- s_algo->updateDescriptors(s_kompute_context->pool.get());
758
- }
759
- seq.record<kp::OpAlgoDispatch>(s_algo);
760
- }
761
-
762
- template <typename... Args>
763
- static void ggml_vk_silu(Args&&... args) {
764
- const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
765
- kp::shader_data::op_silu_comp_spv_len);
766
-
767
- ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
768
- }
769
-
770
- template <typename... Args>
771
- static void ggml_vk_relu(Args&&... args) {
772
- const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
773
- kp::shader_data::op_relu_comp_spv_len);
774
-
775
- ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
776
- }
777
-
778
- template <typename... Args>
779
- static void ggml_vk_gelu(Args&&... args) {
780
- const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
781
- kp::shader_data::op_gelu_comp_spv_len);
782
-
783
- ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
784
- }
785
-
786
- static void ggml_vk_soft_max(
787
- kp::Sequence& seq,
788
- const std::shared_ptr<kp::Tensor>& inA,
789
- const std::shared_ptr<kp::Tensor>& inB,
790
- const std::shared_ptr<kp::Tensor>& out,
791
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
792
- int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
793
- float scale, float max_bias, float m0, float m1,
794
- uint32_t n_head_log2
795
- ) {
796
- const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
797
- kp::shader_data::op_softmax_comp_spv_len);
798
-
799
- struct PushConstants {
800
- uint32_t inAOff, inBOff, outOff;
801
- int32_t ne00, ne01, ne02;
802
- float scale, max_bias, m0, m1;
803
- uint32_t n_head_log2;
804
- int32_t mask;
805
- } pushConsts {
806
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
807
- ne00, ne01, ne02,
808
- scale, max_bias, m0, m1,
809
- n_head_log2,
810
- bool(inB)
811
- };
812
-
813
- auto & inB_ = inB ? inB : inA;
814
-
815
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
816
- if (!komputeManager()->hasAlgorithm(__func__)) {
817
- // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
818
- const uint32_t local_x = 32;
819
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
820
- } else {
821
- s_algo = komputeManager()->getAlgorithm(__func__);
822
- s_algo->setTensors({inA, inB_, out});
823
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
824
- s_algo->setPushConstants<PushConstants>({pushConsts});
825
- s_algo->updateDescriptors(s_kompute_context->pool.get());
826
- }
827
- seq.record<kp::OpAlgoDispatch>(s_algo);
828
- }
829
-
830
- static void ggml_vk_norm_(
831
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
832
- const std::shared_ptr<kp::Tensor>& in,
833
- const std::shared_ptr<kp::Tensor>& out,
834
- uint32_t inOff, uint32_t outOff,
835
- int32_t ne00, int32_t nb01,
836
- int32_t nrows, float epsilon
837
- ) {
838
- GGML_ASSERT(nb01%sizeof(float) == 0);
839
- GGML_ASSERT(ne00%sizeof(float) == 0);
840
-
841
- struct PushConstants {
842
- uint32_t inOff, outOff;
843
- uint32_t ne00, nb01;
844
- float eps;
845
- } pushConsts {
846
- safe_divide(inOff, 4), safe_divide(outOff, 4),
847
- (uint32_t)ne00, (uint32_t)nb01, epsilon
848
- };
849
-
850
- auto name = std::string(__func__) + "_" + suffix;
851
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
852
- if (!komputeManager()->hasAlgorithm(name)) {
853
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
854
- } else {
855
- s_algo = komputeManager()->getAlgorithm(name);
856
- s_algo->setTensors({in, out});
857
- s_algo->setWorkgroup({(uint32_t)nrows});
858
- s_algo->setPushConstants<PushConstants>({pushConsts});
859
- s_algo->updateDescriptors(s_kompute_context->pool.get());
860
- }
861
- seq.record<kp::OpAlgoDispatch>(s_algo);
862
- }
863
-
864
- template <typename... Args>
865
- static void ggml_vk_norm(Args&&... args) {
866
- const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
867
- kp::shader_data::op_norm_comp_spv_len);
868
-
869
- ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
870
- }
871
-
872
- template <typename... Args>
873
- static void ggml_vk_rms_norm(Args&&... args) {
874
- const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
875
- kp::shader_data::op_rmsnorm_comp_spv_len);
876
-
877
- ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
878
- }
879
-
880
- static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
881
- const std::shared_ptr<kp::Tensor>& in,
882
- const std::shared_ptr<kp::Tensor>& out,
883
- uint32_t inOff, uint32_t outOff,
884
- uint32_t n_past,
885
- int32_t ne00, int32_t ne01, int32_t ne02) {
886
- const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
887
- kp::shader_data::op_diagmask_comp_spv_len);
888
-
889
- struct PushConstants {
890
- uint32_t inOff, outOff;
891
- uint32_t n_past;
892
- int32_t ne00, ne01;
893
- } pushConsts {
894
- safe_divide(inOff, 4), safe_divide(outOff, 4),
895
- n_past,
896
- ne00, ne01
897
- };
898
-
899
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
900
- if (!komputeManager()->hasAlgorithm(__func__))
901
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
902
- else {
903
- s_algo = komputeManager()->getAlgorithm(__func__);
904
- s_algo->setTensors({in, out});
905
- s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
906
- s_algo->setPushConstants<PushConstants>({pushConsts});
907
- s_algo->updateDescriptors(s_kompute_context->pool.get());
908
- }
909
- seq.record<kp::OpAlgoDispatch>(s_algo);
910
- }
911
-
912
- static void ggml_vk_mul_mat_f16(
913
- kp::Sequence& seq,
914
- const std::shared_ptr<kp::Tensor>& inA,
915
- const std::shared_ptr<kp::Tensor>& inB,
916
- const std::shared_ptr<kp::Tensor>& out,
917
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
918
- int32_t ne00, int32_t ne01, int32_t ne02,
919
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
920
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
921
- uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
922
- int32_t ne0, int32_t ne1,
923
- uint32_t r2, uint32_t r3
924
- ) {
925
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
926
- kp::shader_data::op_mul_mat_f16_comp_spv_len);
927
-
928
- struct PushConstants {
929
- uint32_t inAOff, inBOff, outOff;
930
- int32_t ne00, ne01, ne02;
931
- uint32_t nb00, nb01, nb02, nb03;
932
- int32_t ne10, ne11, ne12;
933
- uint32_t nb10, nb11, nb12, nb13;
934
- int32_t ne0, ne1;
935
- uint32_t r2, r3;
936
- } pushConsts {
937
- safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
938
- ne00, ne01, ne02,
939
- nb00, nb01, nb02, nb03,
940
- ne10, ne11, ne12,
941
- nb10, nb11, nb12, nb13,
942
- ne0, ne1,
943
- r2, r3
944
- };
945
-
946
- const unsigned ny = unsigned((ne11 + 4 - 1)/4);
947
-
948
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
949
- if (!komputeManager()->hasAlgorithm(__func__)) {
950
- const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
951
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
952
- } else {
953
- s_algo = komputeManager()->getAlgorithm(__func__);
954
- s_algo->setTensors({inA, inB, out});
955
- s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
956
- s_algo->setPushConstants<PushConstants>({pushConsts});
957
- s_algo->updateDescriptors(s_kompute_context->pool.get());
958
- }
959
- seq.record<kp::OpAlgoDispatch>(s_algo);
960
- }
961
-
962
- static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
963
- const std::shared_ptr<kp::Tensor>& inA,
964
- const std::shared_ptr<kp::Tensor>& inB,
965
- const std::shared_ptr<kp::Tensor>& out,
966
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
967
- int32_t ne00, int32_t ne01, int32_t ne02,
968
- uint32_t nb01, uint32_t nb02,
969
- int32_t ne11, int32_t ne12,
970
- uint32_t nb11, uint32_t nb12,
971
- uint32_t nb1, uint32_t nb2) {
972
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
973
- kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);
974
-
975
- struct PushConstants {
976
- uint32_t inAOff, inBOff, outOff;
977
- int32_t ne00, ne01, ne02, ne11, ne12;
978
- uint32_t nb01, nb02;
979
- uint32_t nb11, nb12;
980
- uint32_t nb1, nb2;
981
- } pushConsts {
982
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
983
- ne00, ne01, ne02, ne11, ne12,
984
- nb01, nb02, nb11, nb12,
985
- nb1, nb2
986
- };
987
-
988
- const uint32_t local_x = ggml_vk_current_device().subgroupSize;
989
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
990
- if (!komputeManager()->hasAlgorithm(__func__)) {
991
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
992
- {inA, inB, out}, spirv,
993
- {unsigned(ne01),
994
- unsigned(ne11),
995
- unsigned(std::max(ne12, ne02))
996
- },
997
- {local_x},
998
- {pushConsts});
999
- } else {
1000
- s_algo = komputeManager()->getAlgorithm(__func__);
1001
- s_algo->setTensors({inA, inB, out});
1002
- s_algo->setWorkgroup({unsigned(ne01),
1003
- unsigned(ne11),
1004
- unsigned(std::max(ne12, ne02)),
1005
- });
1006
- s_algo->setPushConstants<PushConstants>({pushConsts});
1007
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1008
- }
1009
- seq.record<kp::OpAlgoDispatch>(s_algo);
1010
- }
1011
-
1012
- static void ggml_vk_mul_mat_impl(
1013
- const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
1014
- const std::shared_ptr<kp::Tensor>& inA,
1015
- const std::shared_ptr<kp::Tensor>& inB,
1016
- const std::shared_ptr<kp::Tensor>& out,
1017
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1018
- int32_t ne00, int32_t ne01, int32_t ne02,
1019
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1020
- int32_t ne0, int32_t ne1,
1021
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1022
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1023
- uint32_t r2, uint32_t r3
1024
- ) {
1025
- struct PushConstants {
1026
- uint32_t inAOff, inBOff, outOff;
1027
- int32_t ne00, ne01, ne02;
1028
- int32_t ne10, ne12;
1029
- int32_t ne0, ne1;
1030
- uint32_t nb01, nb02, nb03;
1031
- uint32_t nb11, nb12, nb13;
1032
- uint32_t r2, r3;
1033
- } pushConsts {
1034
- safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1035
- ne00, ne01, ne02,
1036
- ne10, ne12,
1037
- ne0, ne1,
1038
- nb01, nb02, nb03,
1039
- nb11, nb12, nb13,
1040
- r2, r3
1041
- };
1042
-
1043
- auto name = std::string(__func__) + "_" + suffix;
1044
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1045
- if (!komputeManager()->hasAlgorithm(name)) {
1046
- const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
1047
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
1048
- } else {
1049
- s_algo = komputeManager()->getAlgorithm(name);
1050
- s_algo->setTensors({inA, inB, out});
1051
- s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
1052
- s_algo->setPushConstants<PushConstants>({pushConsts});
1053
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1054
- }
1055
- seq.record<kp::OpAlgoDispatch>(s_algo);
1056
- }
1057
-
1058
- template <typename... Args>
1059
- static void ggml_vk_mul_mat_q4_0(Args&&... args) {
1060
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
1061
- kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
1062
-
1063
- ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1064
- }
1065
-
1066
- template <typename... Args>
1067
- static void ggml_vk_mul_mat_q4_1(Args&&... args) {
1068
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
1069
- kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
1070
-
1071
- ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1072
- }
1073
-
1074
- template <typename... Args>
1075
- static void ggml_vk_mul_mat_q8_0(Args&&... args) {
1076
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
1077
- kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
1078
-
1079
- ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1080
- }
1081
-
1082
- static void ggml_vk_mul_mat_q4_k(
1083
- kp::Sequence& seq,
1084
- const std::shared_ptr<kp::Tensor>& inA,
1085
- const std::shared_ptr<kp::Tensor>& inB,
1086
- const std::shared_ptr<kp::Tensor>& out,
1087
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1088
- int32_t ne00, int32_t ne01, int32_t ne02,
1089
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1090
- int32_t ne0, int32_t ne1,
1091
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1092
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1093
- uint32_t r2, uint32_t r3
1094
- ) {
1095
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
1096
- kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
1097
-
1098
- struct PushConstants {
1099
- uint32_t inAOff, inBOff, outOff;
1100
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1101
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1102
- uint32_t r2, r3;
1103
- } pushConsts {
1104
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1105
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1106
- nb01, nb02, nb03, nb11, nb12, nb13,
1107
- r2, r3
1108
- };
1109
-
1110
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1111
- if (!komputeManager()->hasAlgorithm(__func__)) {
1112
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
1113
- } else {
1114
- s_algo = komputeManager()->getAlgorithm(__func__);
1115
- s_algo->setTensors({inA, inB, out});
1116
- s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
1117
- s_algo->setPushConstants<PushConstants>({pushConsts});
1118
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1119
- }
1120
- seq.record<kp::OpAlgoDispatch>(s_algo);
1121
- }
1122
-
1123
- static void ggml_vk_mul_mat_q6_k(
1124
- kp::Sequence& seq,
1125
- const std::shared_ptr<kp::Tensor>& inA,
1126
- const std::shared_ptr<kp::Tensor>& inB,
1127
- const std::shared_ptr<kp::Tensor>& out,
1128
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1129
- int32_t ne00, int32_t ne01, int32_t ne02,
1130
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1131
- int32_t ne0, int32_t ne1,
1132
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1133
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1134
- uint32_t r2, uint32_t r3
1135
- ) {
1136
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
1137
- kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
1138
-
1139
- struct PushConstants {
1140
- uint32_t inAOff, inBOff, outOff;
1141
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1142
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1143
- uint32_t r2, r3;
1144
- } pushConsts {
1145
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1146
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1147
- nb01, nb02, nb03, nb11, nb12, nb13,
1148
- r2, r3
1149
- };
1150
-
1151
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1152
- if (!komputeManager()->hasAlgorithm(__func__)) {
1153
- const uint32_t local_x = 2;
1154
- const uint32_t local_y = ggml_vk_current_device().subgroupSize;
1155
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
1156
- } else {
1157
- s_algo = komputeManager()->getAlgorithm(__func__);
1158
- s_algo->setTensors({inA, inB, out});
1159
- s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
1160
- s_algo->setPushConstants<PushConstants>({pushConsts});
1161
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1162
- }
1163
- seq.record<kp::OpAlgoDispatch>(s_algo);
1164
- }
1165
-
1166
- static void ggml_vk_get_rows(
1167
- const std::vector<uint32_t>& spirv,
1168
- const char * suffix,
1169
- unsigned element_size, unsigned qk,
1170
- kp::Sequence& seq,
1171
- const std::shared_ptr<kp::Tensor>& inA,
1172
- const std::shared_ptr<kp::Tensor>& inB,
1173
- const std::shared_ptr<kp::Tensor>& out,
1174
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1175
- int32_t ne00, int32_t nb01, int32_t nb1,
1176
- uint32_t size
1177
- ) {
1178
- GGML_ASSERT(nb01%element_size == 0);
1179
- GGML_ASSERT(nb1%sizeof(float) == 0);
1180
- if (qk) GGML_ASSERT(ne00%qk == 0);
1181
-
1182
- struct PushConstants {
1183
- uint32_t inAOff, inBOff, outOff;
1184
- int32_t ne00, nb01, nb1;
1185
- } pushConsts {
1186
- safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1187
- ne00, nb01, nb1
1188
- };
1189
-
1190
- auto name = std::string(__func__) + "_" + suffix;
1191
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1192
- if (!komputeManager()->hasAlgorithm(name)) {
1193
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
1194
- } else {
1195
- s_algo = komputeManager()->getAlgorithm(name);
1196
- s_algo->setTensors({inA, inB, out});
1197
- s_algo->setWorkgroup({size});
1198
- s_algo->setPushConstants<PushConstants>({pushConsts});
1199
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1200
- }
1201
- seq.record<kp::OpAlgoDispatch>(s_algo);
1202
- }
1203
-
1204
- template <typename... Args>
1205
- static void ggml_vk_get_rows_f32(Args&&... args) {
1206
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
1207
- kp::shader_data::op_getrows_f32_comp_spv_len);
1208
-
1209
- ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
1210
- }
1211
-
1212
- template <typename... Args>
1213
- static void ggml_vk_get_rows_f16(Args&&... args) {
1214
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
1215
- kp::shader_data::op_getrows_f16_comp_spv_len);
1216
-
1217
- ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
1218
- }
1219
-
1220
- template <typename... Args>
1221
- static void ggml_vk_get_rows_q4_0(Args&&... args) {
1222
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
1223
- kp::shader_data::op_getrows_q4_0_comp_spv_len);
1224
-
1225
- ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
1226
- }
1227
-
1228
- template <typename... Args>
1229
- static void ggml_vk_get_rows_q4_1(Args&&... args) {
1230
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
1231
- kp::shader_data::op_getrows_q4_1_comp_spv_len);
1232
-
1233
- ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
1234
- }
1235
-
1236
- template <typename... Args>
1237
- static void ggml_vk_get_rows_q6_k(Args&&... args) {
1238
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
1239
- kp::shader_data::op_getrows_q6_k_comp_spv_len);
1240
- ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
1241
- }
1242
-
1243
- static void ggml_vk_rope(
1244
- kp::Sequence& seq,
1245
- const std::shared_ptr<kp::Tensor>& inA,
1246
- const std::shared_ptr<kp::Tensor>& inB,
1247
- const std::shared_ptr<kp::Tensor>& inC,
1248
- const std::shared_ptr<kp::Tensor>& out,
1249
- uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
1250
- ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
1251
- float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
1252
- int32_t ne01, int32_t ne02, int32_t ne03,
1253
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1254
- int32_t ne0,
1255
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1256
- ) {
1257
- GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
1258
-
1259
- static const auto spirv_norm_f16 = getSpirvShader(
1260
- kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
1261
- );
1262
- static const auto spirv_norm_f32 = getSpirvShader(
1263
- kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
1264
- );
1265
- static const auto spirv_neox_f16 = getSpirvShader(
1266
- kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
1267
- );
1268
- static const auto spirv_neox_f32 = getSpirvShader(
1269
- kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
1270
- );
1271
-
1272
- int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
1273
-
1274
- GGML_ASSERT(nb03 % type_size == 0);
1275
- GGML_ASSERT(nb02 % type_size == 0);
1276
- GGML_ASSERT(nb01 % type_size == 0);
1277
- GGML_ASSERT(nb00 % type_size == 0);
1278
- GGML_ASSERT(nb3 % type_size == 0);
1279
- GGML_ASSERT(nb2 % type_size == 0);
1280
- GGML_ASSERT(nb1 % type_size == 0);
1281
- GGML_ASSERT(nb0 % type_size == 0);
1282
-
1283
- struct PushConstants {
1284
- uint32_t inAOff, inBOff, inCOff, outOff;
1285
- int32_t n_dims, mode, n_ctx_orig;
1286
- float freq_base, freq_scale;
1287
- bool has_freq_factors;
1288
- float ext_factor, attn_factor, beta_fast, beta_slow;
1289
- uint32_t nb00, nb01, nb02, nb03;
1290
- int32_t ne0;
1291
- uint32_t nb0, nb1, nb2, nb3;
1292
- } pushConsts {
1293
- safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
1294
- n_dims, mode, n_ctx_orig,
1295
- freq_base, freq_scale,
1296
- has_freq_factors,
1297
- ext_factor, attn_factor, beta_fast, beta_slow,
1298
- nb00, nb01, nb02, nb03,
1299
- ne0,
1300
- nb0, nb1, nb2, nb3
1301
- };
1302
-
1303
- auto & inC_ = inC ? inC : inA;
1304
- const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
1305
- const bool is_f16 = src0t == GGML_TYPE_F16;
1306
-
1307
- auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
1308
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1309
- if (!komputeManager()->hasAlgorithm(name)) {
1310
- auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
1311
- s_algo = komputeManager()->algorithm<float, PushConstants>(
1312
- name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
1313
- {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
1314
- );
1315
- } else {
1316
- s_algo = komputeManager()->getAlgorithm(name);
1317
- s_algo->setTensors({inA, inB, inC_, out});
1318
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1319
- s_algo->setPushConstants<PushConstants>({pushConsts});
1320
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1321
- }
1322
- seq.record<kp::OpAlgoDispatch>(s_algo);
1323
- }
1324
-
1325
- static void ggml_vk_cpy(
1326
- const std::vector<uint32_t>& spirv,
1327
- uint32_t in_element_size, uint32_t out_element_size,
1328
- kp::Sequence& seq,
1329
- const std::shared_ptr<kp::Tensor>& in,
1330
- const std::shared_ptr<kp::Tensor>& out,
1331
- uint32_t inOff, uint32_t outOff,
1332
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
1333
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1334
- int32_t ne0, int32_t ne1, int32_t ne2,
1335
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1336
- ) {
1337
- struct PushConstants {
1338
- uint32_t inOff, outOff;
1339
- int32_t ne00, ne01, ne02;
1340
- uint32_t nb00, nb01, nb02, nb03;
1341
- int32_t ne0, ne1, ne2;
1342
- uint32_t nb0, nb1, nb2, nb3;
1343
- } pushConsts {
1344
- safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
1345
- ne00, ne01, ne02,
1346
- nb00, nb01, nb02, nb03,
1347
- ne0, ne1, ne2,
1348
- nb0, nb1, nb2, nb3
1349
- };
1350
-
1351
- std::string name = std::string(__func__)
1352
- + "_i_" + std::to_string(in_element_size)
1353
- + "_o_" + std::to_string(out_element_size);
1354
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1355
- if (!komputeManager()->hasAlgorithm(name))
1356
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
1357
- else {
1358
- s_algo = komputeManager()->getAlgorithm(name);
1359
- s_algo->setTensors({in, out});
1360
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1361
- s_algo->setPushConstants<PushConstants>({pushConsts});
1362
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1363
- }
1364
- seq.record<kp::OpAlgoDispatch>(s_algo);
1365
- }
1366
-
1367
- template <typename... Args>
1368
- static void ggml_vk_cpy_f32_f16(Args&&... args) {
1369
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
1370
- kp::shader_data::op_cpy_f32_f16_comp_spv_len);
1371
- ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
1372
- }
1373
-
1374
- template <typename... Args>
1375
- static void ggml_vk_cpy_f32_f32(Args&&... args) {
1376
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
1377
- kp::shader_data::op_cpy_f32_f32_comp_spv_len);
1378
- ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
1379
- }
1380
-
1381
- template <typename... Args>
1382
- static void ggml_vk_cpy_f16_f16(Args&&... args) {
1383
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
1384
- kp::shader_data::op_cpy_f16_f16_comp_spv_len);
1385
- ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
1386
- }
1387
-
1388
- template <typename... Args>
1389
- static void ggml_vk_cpy_f16_f32(Args&&... args) {
1390
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
1391
- kp::shader_data::op_cpy_f16_f32_comp_spv_len);
1392
- ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
1393
- }
1394
-
1395
- static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1396
- int64_t n = ggml_nelements(op);
1397
- switch (op->op) {
1398
- case GGML_OP_UNARY:
1399
- if (n % 4 != 0) return false;
1400
- switch (ggml_get_unary_op(op)) {
1401
- case GGML_UNARY_OP_GELU:
1402
- if (n % 8 != 0) return false;
1403
- // fall through
1404
- case GGML_UNARY_OP_RELU:
1405
- case GGML_UNARY_OP_SILU:
1406
- return ggml_is_contiguous(op->src[0]);
1407
- default:
1408
- ;
1409
- }
1410
- break;
1411
- case GGML_OP_NONE:
1412
- case GGML_OP_RESHAPE:
1413
- case GGML_OP_VIEW:
1414
- case GGML_OP_TRANSPOSE:
1415
- case GGML_OP_PERMUTE:
1416
- case GGML_OP_ADD:
1417
- case GGML_OP_MUL:
1418
- case GGML_OP_SCALE:
1419
- case GGML_OP_SOFT_MAX:
1420
- case GGML_OP_RMS_NORM:
1421
- case GGML_OP_NORM:
1422
- return true;
1423
- case GGML_OP_ROPE:
1424
- {
1425
- const int mode = ((const int32_t *) op->op_params)[2];
1426
- if (mode & GGML_ROPE_TYPE_MROPE) {
1427
- return false;
1428
- }
1429
- if (mode & GGML_ROPE_TYPE_VISION) {
1430
- return false;
1431
- }
1432
- return true;
1433
- }
1434
- case GGML_OP_DUP:
1435
- case GGML_OP_CPY:
1436
- case GGML_OP_CONT:
1437
- switch (op->src[0]->type) {
1438
- case GGML_TYPE_F32:
1439
- case GGML_TYPE_F16:
1440
- break;
1441
- default:
1442
- return false;
1443
- }
1444
- switch (op->type) {
1445
- case GGML_TYPE_F32:
1446
- case GGML_TYPE_F16:
1447
- break;
1448
- default:
1449
- return false;
1450
- }
1451
- return true;
1452
- case GGML_OP_DIAG_MASK_INF:
1453
- return op->ne[3] == 1;
1454
- case GGML_OP_GET_ROWS:
1455
- switch (op->src[0]->type) {
1456
- case GGML_TYPE_F32:
1457
- case GGML_TYPE_F16:
1458
- case GGML_TYPE_Q4_0:
1459
- case GGML_TYPE_Q4_1:
1460
- case GGML_TYPE_Q6_K:
1461
- return op->ne[2] == 1 && op->ne[3] == 1;
1462
- default:
1463
- ;
1464
- }
1465
- return false;
1466
- case GGML_OP_MUL_MAT:
1467
- if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
1468
- return false;
1469
-
1470
- switch (op->src[0]->type) {
1471
- case GGML_TYPE_F32:
1472
- return op->ne[3] == 1;
1473
- case GGML_TYPE_Q6_K:
1474
- case GGML_TYPE_F16:
1475
- case GGML_TYPE_Q8_0:
1476
- case GGML_TYPE_Q4_0:
1477
- case GGML_TYPE_Q4_1:
1478
- case GGML_TYPE_Q4_K:
1479
- return true;
1480
- default:
1481
- ;
1482
- }
1483
- default:
1484
- ;
1485
- }
1486
- return false;
1487
-
1488
- GGML_UNUSED(dev);
1489
- }
1490
-
1491
- static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
1492
- const int n_seq = 8;
1493
-
1494
- // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
1495
- // it to the size of the graph, but I think it can be made smaller?
1496
- ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
1497
-
1498
- std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
1499
-
1500
- for (auto& sequence : sequences) {
1501
- sequence = komputeManager()->sequence();
1502
- }
1503
- for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
1504
- const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
1505
-
1506
- auto& seq = *sequences[seq_idx];
1507
-
1508
- const int node_start = (seq_idx + 0) * n_nodes_per_seq;
1509
- const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
1510
-
1511
- bool any_commands_recorded = false;
1512
-
1513
- for (int i = node_start; i < node_end; ++i) {
1514
- struct ggml_tensor * src0 = gf->nodes[i]->src[0];
1515
- struct ggml_tensor * src1 = gf->nodes[i]->src[1];
1516
- struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
1517
- struct ggml_tensor * dst = gf->nodes[i];
1518
- GGML_ASSERT(dst->data != nullptr);
1519
-
1520
- if (ggml_is_empty(dst)) {
1521
- continue;
1522
- }
1523
-
1524
- switch (dst->op) {
1525
- case GGML_OP_NONE:
1526
- case GGML_OP_RESHAPE:
1527
- case GGML_OP_VIEW:
1528
- case GGML_OP_TRANSPOSE:
1529
- case GGML_OP_PERMUTE:
1530
- continue; // noop -> next node
1531
- default:
1532
- break;
1533
- }
1534
-
1535
- any_commands_recorded = true;
1536
-
1537
- const int32_t ne00 = src0 ? src0->ne[0] : 0;
1538
- const int32_t ne01 = src0 ? src0->ne[1] : 0;
1539
- const int32_t ne02 = src0 ? src0->ne[2] : 0;
1540
- const int32_t ne03 = src0 ? src0->ne[3] : 0;
1541
-
1542
- const uint32_t nb00 = src0 ? src0->nb[0] : 0;
1543
- const uint32_t nb01 = src0 ? src0->nb[1] : 0;
1544
- const uint32_t nb02 = src0 ? src0->nb[2] : 0;
1545
- const uint32_t nb03 = src0 ? src0->nb[3] : 0;
1546
-
1547
- const int32_t ne10 = src1 ? src1->ne[0] : 0;
1548
- const int32_t ne11 = src1 ? src1->ne[1] : 0;
1549
- const int32_t ne12 = src1 ? src1->ne[2] : 0;
1550
- const int32_t ne13 = src1 ? src1->ne[3] : 0;
1551
-
1552
- const uint32_t nb10 = src1 ? src1->nb[0] : 0;
1553
- const uint32_t nb11 = src1 ? src1->nb[1] : 0;
1554
- const uint32_t nb12 = src1 ? src1->nb[2] : 0;
1555
- const uint32_t nb13 = src1 ? src1->nb[3] : 0;
1556
-
1557
- const int32_t ne0 = dst ? dst->ne[0] : 0;
1558
- const int32_t ne1 = dst ? dst->ne[1] : 0;
1559
- const int32_t ne2 = dst ? dst->ne[2] : 0;
1560
- // const int32_t ne3 = dst ? dst->ne[3] : 0;
1561
-
1562
- const uint32_t nb0 = dst ? dst->nb[0] : 0;
1563
- const uint32_t nb1 = dst ? dst->nb[1] : 0;
1564
- const uint32_t nb2 = dst ? dst->nb[2] : 0;
1565
- const uint32_t nb3 = dst ? dst->nb[3] : 0;
1566
-
1567
- const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
1568
- const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
1569
- const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
1570
-
1571
- const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
1572
- uint32_t off_src0 = 0;
1573
- uint32_t off_src1 = 0;
1574
- uint32_t off_src2 = 0;
1575
- uint32_t off_dst = 0;
1576
- const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1577
- const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
1578
- const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
1579
- const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1580
-
1581
- switch (dst->op) {
1582
- case GGML_OP_ADD:
1583
- {
1584
- if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
1585
- // src1 is a row
1586
- ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
1587
- } else {
1588
- ggml_vk_add(
1589
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1590
- ne00, ne01, ne02, ne03,
1591
- nb00, nb01, nb02, nb03,
1592
- ne10, ne11, ne12, ne13,
1593
- nb10, nb11, nb12, nb13,
1594
- ne0,
1595
- nb0, nb1, nb2, nb3
1596
- );
1597
- }
1598
- } break;
1599
- case GGML_OP_MUL:
1600
- {
1601
- ggml_vk_mul(
1602
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1603
- ne00, ne01, ne02, ne03,
1604
- nb00, nb01, nb02, nb03,
1605
- ne10, ne11, ne12, ne13,
1606
- nb10, nb11, nb12, nb13,
1607
- ne0,
1608
- nb0, nb1, nb2, nb3
1609
- );
1610
- } break;
1611
- case GGML_OP_SCALE:
1612
- {
1613
- float scale; memcpy(&scale, dst->op_params, sizeof(float));
1614
-
1615
- ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
1616
- } break;
1617
- case GGML_OP_UNARY:
1618
- {
1619
- int64_t n = ggml_nelements(dst);
1620
- GGML_ASSERT(n % 4 == 0);
1621
- switch (ggml_get_unary_op(gf->nodes[i])) {
1622
- case GGML_UNARY_OP_SILU:
1623
- {
1624
- ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1625
- } break;
1626
- case GGML_UNARY_OP_RELU:
1627
- {
1628
- ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1629
- } break;
1630
- case GGML_UNARY_OP_GELU:
1631
- {
1632
- GGML_ASSERT(n % 8 == 0);
1633
- ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
1634
- } break;
1635
- default:
1636
- {
1637
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1638
- GGML_ABORT("fatal error");
1639
- }
1640
- }
1641
- } break;
1642
- case GGML_OP_SOFT_MAX:
1643
- {
1644
- float scale;
1645
- float max_bias;
1646
-
1647
- memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
1648
- memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
1649
-
1650
- #pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
1651
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
1652
- GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
1653
-
1654
- const int64_t nrows_x = ggml_nrows(src0);
1655
- const int64_t nrows_y = src0->ne[1];
1656
-
1657
- const uint32_t n_head = nrows_x/nrows_y;
1658
- const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
1659
-
1660
- const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
1661
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1662
-
1663
- ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
1664
- } break;
1665
- case GGML_OP_DIAG_MASK_INF:
1666
- {
1667
- const int n_past = ((int32_t *)(dst->op_params))[0];
1668
- ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
1669
- } break;
1670
- case GGML_OP_NORM:
1671
- {
1672
- float eps;
1673
- memcpy(&eps, dst->op_params, sizeof(float));
1674
- ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1675
- } break;
1676
- case GGML_OP_RMS_NORM:
1677
- {
1678
- GGML_ASSERT(ne00 % 4 == 0);
1679
-
1680
- float eps;
1681
- memcpy(&eps, dst->op_params, sizeof(float));
1682
- ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1683
- } break;
1684
- case GGML_OP_MUL_MAT:
1685
- {
1686
- GGML_ASSERT(ne00 == ne10);
1687
-
1688
- GGML_ASSERT(ne12 % ne02 == 0);
1689
- GGML_ASSERT(ne13 % ne03 == 0);
1690
-
1691
- const uint32_t r2 = ne12/ne02;
1692
- const uint32_t r3 = ne13/ne03;
1693
-
1694
- if (src1t != GGML_TYPE_F32) {
1695
- fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1696
- goto not_implemented;
1697
- }
1698
-
1699
- if (ggml_is_transposed(src0) ||
1700
- ggml_is_transposed(src1)) {
1701
- fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1702
- goto not_implemented;
1703
- }
1704
-
1705
- switch (src0t) {
1706
- case GGML_TYPE_F32:
1707
- ggml_vk_mul_mat_mat_f32(
1708
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1709
- ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
1710
- );
1711
- break;
1712
- case GGML_TYPE_F16:
1713
- ggml_vk_mul_mat_f16(
1714
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1715
- ne00, ne01, ne02, nb00, nb01, nb02, nb03,
1716
- ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
1717
- ne0, ne1, r2, r3
1718
- );
1719
- break;
1720
- case GGML_TYPE_Q8_0:
1721
- ggml_vk_mul_mat_q8_0(
1722
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1723
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1724
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1725
- );
1726
- break;
1727
- case GGML_TYPE_Q4_0:
1728
- ggml_vk_mul_mat_q4_0(
1729
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1730
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1731
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1732
- );
1733
- break;
1734
- case GGML_TYPE_Q4_1:
1735
- ggml_vk_mul_mat_q4_1(
1736
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1737
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1738
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1739
- );
1740
- break;
1741
- case GGML_TYPE_Q4_K:
1742
- ggml_vk_mul_mat_q4_k(
1743
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1744
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1745
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1746
- );
1747
- break;
1748
- case GGML_TYPE_Q6_K:
1749
- ggml_vk_mul_mat_q6_k(
1750
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1751
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1752
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1753
- );
1754
- break;
1755
- default: {
1756
- fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1757
- goto not_implemented;
1758
- }
1759
- }
1760
-
1761
- } break;
1762
- case GGML_OP_GET_ROWS:
1763
- {
1764
- if (src0t == GGML_TYPE_F32) {
1765
- ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1766
- } else if (src0t == GGML_TYPE_F16) {
1767
- ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1768
- } else if (src0t == GGML_TYPE_Q4_0) {
1769
- ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1770
- } else if (src0t == GGML_TYPE_Q4_1) {
1771
- ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1772
- } else if (src0t == GGML_TYPE_Q6_K) {
1773
- ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1774
- } else {
1775
- fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
1776
- goto not_implemented;
1777
- }
1778
- } break;
1779
- case GGML_OP_ROPE:
1780
- {
1781
- GGML_ASSERT(ne10 == ne02);
1782
- GGML_ASSERT(src0t == dstt);
1783
- // const int n_past = ((int32_t *) dst->op_params)[0];
1784
- const int n_dims = ((int32_t *) dst->op_params)[1];
1785
- const int mode = ((int32_t *) dst->op_params)[2];
1786
- // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
1787
- const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1788
-
1789
- const bool has_freq_factors = dst->src[2] != nullptr;
1790
-
1791
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1792
- memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1793
- memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
1794
- memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
1795
- memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
1796
- memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1797
- memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1798
- ggml_vk_rope(
1799
- seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
1800
- freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
1801
- ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
1802
- );
1803
- } break;
1804
- case GGML_OP_DUP:
1805
- case GGML_OP_CPY:
1806
- case GGML_OP_CONT:
1807
- {
1808
- switch (src0t) {
1809
- case GGML_TYPE_F32:
1810
- {
1811
- switch (dstt) {
1812
- case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1813
- case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1814
- default: goto not_implemented;
1815
- }
1816
- } break;
1817
- case GGML_TYPE_F16:
1818
- {
1819
- switch (dstt) {
1820
- case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1821
- case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1822
- default: goto not_implemented;
1823
- } break;
1824
- default: goto not_implemented;
1825
- }
1826
- }
1827
- } break;
1828
- default: goto not_implemented;
1829
- }
1830
- continue;
1831
- not_implemented: {}
1832
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1833
- //GGML_ABORT("fatal error");
1834
- }
1835
-
1836
- // Evaluate sequence
1837
- if (any_commands_recorded) {
1838
- seq.evalAsync();
1839
- }
1840
- }
1841
-
1842
- // Wait for all sequences to finish
1843
- for (auto& sequence : sequences) {
1844
- if (sequence->isRunning())
1845
- sequence->evalAwait();
1846
- }
1847
-
1848
- ggml_vk_free_descriptor_pool(ctx);
1849
- }
1850
-
1851
- template<>
1852
- kp::Tensor::TensorDataTypes
1853
- kp::TensorT<half>::dataType()
1854
- {
1855
- return TensorDataTypes::eFloat;
1856
- }
1857
-
1858
- template<>
1859
- kp::Tensor::TensorDataTypes
1860
- kp::TensorT<uint8_t>::dataType()
1861
- {
1862
- return TensorDataTypes::eUnsignedInt;
1863
- }
1864
-
1865
- ////////////////////////////////////////////////////////////////////////////////
1866
-
1867
- // backend interface
1868
-
1869
- struct ggml_backend_kompute_buffer_type_context {
1870
- int device;
1871
- int device_ref = 0;
1872
- uint64_t buffer_alignment;
1873
- uint64_t max_alloc;
1874
- std::string name;
1875
-
1876
- ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
1877
- : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
1878
- };
1879
-
1880
- static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
1881
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1882
-
1883
- if (!ctx->device_ref) {
1884
- komputeManager()->initializeDevice(
1885
- ctx->device, {}, {
1886
- "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
1887
- "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
1888
- }
1889
- );
1890
- }
1891
-
1892
- assert(ggml_vk_has_device());
1893
- ctx->device_ref++;
1894
- }
1895
-
1896
- static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
1897
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1898
-
1899
- assert(ctx->device_ref > 0);
1900
-
1901
- ctx->device_ref--;
1902
-
1903
- if (!ctx->device_ref) {
1904
- komputeManager.destroy();
1905
- }
1906
- }
1907
-
1908
- static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1909
- auto * memory = (ggml_vk_memory *)buffer->context;
1910
- if (ggml_vk_has_device()) {
1911
- ggml_vk_free_memory(*memory);
1912
- }
1913
- delete memory;
1914
- }
1915
-
1916
- static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
1917
- return ((ggml_vk_memory *)buffer->context)->data;
1918
- }
1919
-
1920
- static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1921
- GGML_UNUSED(buffer);
1922
-
1923
- const auto res = ggml_vk_get_tensor(tensor);
1924
- GGML_ASSERT(res);
1925
-
1926
- memcpy((char *)tensor->data + offset, data, size);
1927
-
1928
- komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
1929
- }
1930
-
1931
- static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1932
- GGML_UNUSED(buffer);
1933
-
1934
- const auto res = ggml_vk_get_tensor(tensor);
1935
- GGML_ASSERT(res);
1936
-
1937
- komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
1938
-
1939
- memcpy(data, (const char *)tensor->data + offset, size);
1940
- }
1941
-
1942
- static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1943
- auto * memory = (ggml_vk_memory *)buffer->context;
1944
- memset(memory->data, value, buffer->size);
1945
-
1946
- if (memory->stagingBuffer)
1947
- komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
1948
- }
1949
-
1950
- static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
1951
- /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
1952
- /* .get_base = */ ggml_backend_kompute_buffer_get_base,
1953
- /* .init_tensor = */ NULL,
1954
- /* .memset_tensor = */ NULL,
1955
- /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor,
1956
- /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor,
1957
- /* .cpy_tensor = */ NULL,
1958
- /* .clear = */ ggml_backend_kompute_buffer_clear,
1959
- /* .reset = */ NULL,
1960
- };
1961
-
1962
- // default buffer type
1963
-
1964
- static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
1965
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1966
- return ctx->name.c_str();
1967
- }
1968
-
1969
- static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1970
- ggml_backend_kompute_device_ref(buft);
1971
- auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
1972
- return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
1973
- }
1974
-
1975
- static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1976
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1977
- return ctx->buffer_alignment;
1978
- }
1979
-
1980
- static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
1981
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1982
- return ctx->max_alloc;
1983
- }
1984
-
1985
- static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
1986
- /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
1987
- /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
1988
- /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
1989
- /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
1990
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1991
- /* .is_host = */ NULL,
1992
- };
1993
-
1994
- ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
1995
- static std::mutex mutex;
1996
- std::lock_guard<std::mutex> lock(mutex);
1997
-
1998
- auto devices = ggml_vk_available_devices();
1999
- int32_t device_count = (int32_t) devices.size();
2000
- GGML_ASSERT(device < device_count);
2001
- GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
2002
-
2003
- static ggml_backend_buffer_type
2004
- ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
2005
-
2006
- static bool ggml_backend_kompute_buffer_type_initialized = false;
2007
-
2008
- if (!ggml_backend_kompute_buffer_type_initialized) {
2009
- for (int32_t i = 0; i < device_count; i++) {
2010
- ggml_backend_kompute_buffer_types[i] = {
2011
- /* .iface = */ ggml_backend_kompute_buffer_type_interface,
2012
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
2013
- /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
2014
- };
2015
- }
2016
- ggml_backend_kompute_buffer_type_initialized = true;
2017
- }
2018
-
2019
- return &ggml_backend_kompute_buffer_types[device];
2020
- }
2021
-
2022
- // backend
2023
-
2024
- static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
2025
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2026
- return ctx->name.c_str();
2027
- }
2028
-
2029
- static void ggml_backend_kompute_free(ggml_backend_t backend) {
2030
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2031
-
2032
- assert(ctx == s_kompute_context);
2033
- s_kompute_context = nullptr;
2034
- if (ctx != nullptr) {
2035
- delete ctx;
2036
- }
2037
-
2038
- delete backend;
2039
- }
2040
-
2041
- static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2042
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2043
- ggml_vk_graph_compute(ctx, cgraph);
2044
- return GGML_STATUS_SUCCESS;
2045
- }
2046
-
2047
- static struct ggml_backend_i kompute_backend_i = {
2048
- /* .get_name = */ ggml_backend_kompute_name,
2049
- /* .free = */ ggml_backend_kompute_free,
2050
- /* .set_tensor_async = */ NULL,
2051
- /* .get_tensor_async = */ NULL,
2052
- /* .cpy_tensor_async = */ NULL,
2053
- /* .synchronize = */ NULL,
2054
- /* .graph_plan_create = */ NULL,
2055
- /* .graph_plan_free = */ NULL,
2056
- /* .graph_plan_update = */ NULL,
2057
- /* .graph_plan_compute = */ NULL,
2058
- /* .graph_compute = */ ggml_backend_kompute_graph_compute,
2059
- /* .event_record = */ NULL,
2060
- /* .event_wait = */ NULL,
2061
- };
2062
-
2063
- static ggml_guid_t ggml_backend_kompute_guid() {
2064
- static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca };
2065
- return &guid;
2066
- }
2067
-
2068
- ggml_backend_t ggml_backend_kompute_init(int device) {
2069
- GGML_ASSERT(s_kompute_context == nullptr);
2070
- s_kompute_context = new ggml_kompute_context(device);
2071
-
2072
- ggml_backend_t kompute_backend = new ggml_backend {
2073
- /* .guid = */ ggml_backend_kompute_guid(),
2074
- /* .interface = */ kompute_backend_i,
2075
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
2076
- /* .context = */ s_kompute_context,
2077
- };
2078
-
2079
- return kompute_backend;
2080
- }
2081
-
2082
- bool ggml_backend_is_kompute(ggml_backend_t backend) {
2083
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
2084
- }
2085
-
2086
- static size_t ggml_backend_kompute_get_device_count() {
2087
- auto devices = ggml_vk_available_devices();
2088
- return devices.size();
2089
- }
2090
-
2091
- static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
2092
- auto devices = ggml_vk_available_devices();
2093
- GGML_ASSERT((size_t) device < devices.size());
2094
- snprintf(description, description_size, "%s", devices[device].name);
2095
- }
2096
-
2097
- static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
2098
- auto devices = ggml_vk_available_devices();
2099
- GGML_ASSERT((size_t) device < devices.size());
2100
- *total = devices[device].heapSize;
2101
- *free = devices[device].heapSize;
2102
- }
2103
-
2104
- //////////////////////////
2105
-
2106
- struct ggml_backend_kompute_device_context {
2107
- int device;
2108
- std::string name;
2109
- std::string description;
2110
- };
2111
-
2112
- static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
2113
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2114
- return ctx->name.c_str();
2115
- }
2116
-
2117
- static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
2118
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2119
- return ctx->description.c_str();
2120
- }
2121
-
2122
- static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2123
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2124
- ggml_backend_kompute_get_device_memory(ctx->device, free, total);
2125
- }
2126
-
2127
- static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
2128
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2129
- return ggml_backend_kompute_buffer_type(ctx->device);
2130
- }
2131
-
2132
- static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2133
- if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
2134
- return false;
2135
- }
2136
-
2137
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2138
- ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
2139
-
2140
- return buft_ctx->device == ctx->device;
2141
- }
2142
-
2143
- static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
2144
- GGML_UNUSED(dev);
2145
- return GGML_BACKEND_DEVICE_TYPE_GPU;
2146
- }
2147
-
2148
- static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
2149
- props->name = ggml_backend_kompute_device_get_name(dev);
2150
- props->description = ggml_backend_kompute_device_get_description(dev);
2151
- props->type = ggml_backend_kompute_device_get_type(dev);
2152
- ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
2153
- props->caps = {
2154
- /* async = */ false,
2155
- /* host_buffer = */ false,
2156
- /* .buffer_from_host_ptr = */ false,
2157
- /* events = */ false,
2158
- };
2159
- }
2160
-
2161
- static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
2162
- GGML_UNUSED(params);
2163
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2164
- return ggml_backend_kompute_init(ctx->device);
2165
- }
2166
-
2167
- static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2168
- const int min_batch_size = 32;
2169
-
2170
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
2171
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
2172
-
2173
- GGML_UNUSED(dev);
2174
- }
2175
-
2176
- static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
2177
- /* .get_name = */ ggml_backend_kompute_device_get_name,
2178
- /* .get_description = */ ggml_backend_kompute_device_get_description,
2179
- /* .get_memory = */ ggml_backend_kompute_device_get_memory,
2180
- /* .get_type = */ ggml_backend_kompute_device_get_type,
2181
- /* .get_props = */ ggml_backend_kompute_device_get_props,
2182
- /* .init_backend = */ ggml_backend_kompute_device_init,
2183
- /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type,
2184
- /* .get_host_buffer_type = */ NULL,
2185
- /* .buffer_from_host_ptr = */ NULL,
2186
- /* .supports_op = */ ggml_backend_kompute_device_supports_op,
2187
- /* .supports_buft = */ ggml_backend_kompute_device_supports_buft,
2188
- /* .offload_op = */ ggml_backend_kompute_device_offload_op,
2189
- /* .event_new = */ NULL,
2190
- /* .event_free = */ NULL,
2191
- /* .event_synchronize = */ NULL,
2192
- };
2193
-
2194
- static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
2195
- GGML_UNUSED(reg);
2196
- return "Kompute";
2197
- }
2198
-
2199
- static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
2200
- GGML_UNUSED(reg);
2201
- return ggml_backend_kompute_get_device_count();
2202
- }
2203
-
2204
- static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
2205
- static std::vector<ggml_backend_dev_t> devices;
2206
-
2207
- static bool initialized = false;
2208
-
2209
- {
2210
- static std::mutex mutex;
2211
- std::lock_guard<std::mutex> lock(mutex);
2212
- if (!initialized) {
2213
- for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
2214
- ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
2215
- char desc[256];
2216
- ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
2217
- ctx->device = i;
2218
- ctx->name = "Kompute" + std::to_string(i);
2219
- ctx->description = desc;
2220
- devices.push_back(new ggml_backend_device {
2221
- /* .iface = */ ggml_backend_kompute_device_i,
2222
- /* .reg = */ reg,
2223
- /* .context = */ ctx,
2224
- });
2225
- }
2226
- initialized = true;
2227
- }
2228
- }
2229
-
2230
- GGML_ASSERT(device < devices.size());
2231
- return devices[device];
2232
- }
2233
-
2234
- static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
2235
- /* .get_name = */ ggml_backend_kompute_reg_get_name,
2236
- /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
2237
- /* .get_device = */ ggml_backend_kompute_reg_get_device,
2238
- /* .get_proc_address = */ NULL,
2239
- };
2240
-
2241
- ggml_backend_reg_t ggml_backend_kompute_reg() {
2242
- static ggml_backend_reg reg = {
2243
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2244
- /* .iface = */ ggml_backend_kompute_reg_i,
2245
- /* .context = */ nullptr,
2246
- };
2247
-
2248
- return &reg;
2249
- }
2250
-
2251
- GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
 
ggml/src/ggml-kompute/kompute-shaders/common.comp DELETED
@@ -1,112 +0,0 @@
1
- #extension GL_EXT_shader_16bit_storage: require
2
- #extension GL_EXT_shader_8bit_storage: require
3
- #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
4
- #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
5
- #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
6
- #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
7
- #extension GL_EXT_control_flow_attributes: enable
8
- #extension GL_KHR_shader_subgroup_arithmetic : require
9
- #extension GL_EXT_debug_printf : enable
10
-
11
- #define QK4_0 32
12
- #define QK4_1 32
13
-
14
- #define GELU_COEF_A 0.044715
15
- #define SQRT_2_OVER_PI 0.79788456080286535587989211986876
16
- #define TWOPI_F 6.283185307179586f
17
-
18
- #define QK_K 256
19
- #define K_SCALE_SIZE 12
20
-
21
- #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
22
- #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
23
- #define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
24
- #define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
25
-
26
- #define sizeof_block_q4_0 0x12
27
- struct block_q4_0 {
28
- float16_t d;
29
- uint8_t qs[QK4_0 / 2];
30
- };
31
- mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
32
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
33
- const float d2 = d1 / 256.f;
34
- const float md = -8.f * xb.d;
35
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
36
- const uint16_t mask1 = mask0 << 8;
37
-
38
- mat4 reg;
39
- for (int i=0;i<8;i++) {
40
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
41
- reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
42
- reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
43
- }
44
- return reg;
45
- }
46
-
47
- #define sizeof_block_q4_1 0x14
48
- struct block_q4_1 {
49
- float16_t d;
50
- float16_t m;
51
- uint8_t qs[QK4_1 / 2];
52
- };
53
- mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
54
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
55
- const float d2 = d1 / 256.f;
56
- const float m = xb.m;
57
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
58
- const uint16_t mask1 = mask0 << 8;
59
-
60
- mat4 reg;
61
- for (int i=0;i<8;i++) {
62
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
63
- reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
64
- reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
65
- }
66
- return reg;
67
- }
68
-
69
- #define sizeof_block_q4_k 144
70
- struct block_q4_k {
71
- float16_t d;
72
- float16_t dmin;
73
- uint8_t scales[K_SCALE_SIZE];
74
- uint8_t qs[QK_K/2];
75
- };
76
-
77
- #define sizeof_block_q6_k 210
78
- struct block_q6_k {
79
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
80
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
81
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
82
- float16_t d; // super-block scale
83
- };
84
- mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
85
- const float16_t d_all = xb.d;
86
-
87
- const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
88
- const uint qhIndex = 32*(il/8) + 16*(il&1);
89
- float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
90
- il = (il/2) & 3;
91
-
92
- const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
93
- const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
94
- const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
95
- const float16_t ml = float16_t(d_all * sc * 32.f);
96
- const float16_t dl = float16_t(d_all * sc * coef);
97
- mat4 reg;
98
- for (int i = 0; i < 16; ++i) {
99
- const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
100
- : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
101
- reg[i/4][i%4] = dl * q - ml;
102
- }
103
- return reg;
104
- }
105
-
106
-
107
- #define QK8_0 32
108
- // struct block_q8_0 {
109
- // float16_t d; // delta
110
- // int8_t qs[QK8_0]; // quants
111
- // };
112
- #define sizeof_block_q8_0 34
 
ggml/src/ggml-kompute/kompute-shaders/op_add.comp DELETED
@@ -1,58 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1024) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb00;
17
- int nb01;
18
- int nb02;
19
- int nb03;
20
- int ne10;
21
- int ne11;
22
- int ne12;
23
- int ne13;
24
- int nb10;
25
- int nb11;
26
- int nb12;
27
- int nb13;
28
- int ne0;
29
- int nb0;
30
- int nb1;
31
- int nb2;
32
- int nb3;
33
- //int offs; // TODO: needed for GGML_OP_ACC, see metal code
34
- } pcs;
35
-
36
- // general-purpose kernel for addition of two tensors
37
- // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
38
- // cons: not very efficient
39
- void main() {
40
- const uint i03 = gl_WorkGroupID.z;
41
- const uint i02 = gl_WorkGroupID.y;
42
- const uint i01 = gl_WorkGroupID.x;
43
-
44
- const uint i13 = i03 % pcs.ne13;
45
- const uint i12 = i02 % pcs.ne12;
46
- const uint i11 = i01 % pcs.ne11;
47
-
48
- int offs = 0; // TMP (see above)
49
-
50
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
51
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
52
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
53
-
54
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
55
- const uint i10 = i0 % pcs.ne10;
56
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
57
- }
58
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp DELETED
@@ -1,25 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- uint row;
16
- } pcs;
17
-
18
- void main() {
19
- const uint baseIndex = gl_WorkGroupID.x * 4;
20
-
21
- for (uint x = 0; x < 4; x++) {
22
- const uint i = baseIndex + x;
23
- out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
24
- }
25
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float16_t
6
- #define IN_TYPE_SIZE 2
7
- #define OUT_TYPE float16_t
8
- #define OUT_TYPE_SIZE 2
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float16_t
6
- #define IN_TYPE_SIZE 2
7
- #define OUT_TYPE float
8
- #define OUT_TYPE_SIZE 4
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float
6
- #define IN_TYPE_SIZE 4
7
- #define OUT_TYPE float16_t
8
- #define OUT_TYPE_SIZE 2
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float
6
- #define IN_TYPE_SIZE 4
7
- #define OUT_TYPE float
8
- #define OUT_TYPE_SIZE 4
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp DELETED
@@ -1,30 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint n_past;
14
- int ne00;
15
- int ne01;
16
- } pcs;
17
-
18
- void main() {
19
- const uint i02 = gl_WorkGroupID.z;
20
- const uint i01 = gl_WorkGroupID.y;
21
- const uint i00 = gl_WorkGroupID.x;
22
-
23
- const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
24
-
25
- if (i00 > pcs.n_past + i01) {
26
- out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
27
- } else {
28
- out_[index + pcs.outOff] = in_[index + pcs.inOff];
29
- }
30
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp DELETED
@@ -1,22 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
- layout(push_constant) uniform PushConstants {
10
- uint inOff;
11
- uint outOff;
12
- } pcs;
13
-
14
- void main() {
15
- const uint baseIndex = gl_WorkGroupID.x * 8;
16
-
17
- for (uint x = 0; x < 8; x++) {
18
- const uint i = baseIndex + x;
19
- const float y = in_[i + pcs.inOff];
20
- out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
21
- }
22
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp DELETED
@@ -1,17 +0,0 @@
1
- void main() {
2
- const uint i = gl_WorkGroupID.x;
3
- const int r = inB[i + pcs.inBOff];
4
-
5
- int z = 0;
6
- for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
7
- const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
8
- const mat4 result = dequantize_block(inIndex, ind%NL);
9
- for (uint j = 0; j < 4; ++j) {
10
- for (uint k = 0; k < 4; ++k) {
11
- const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
12
- out_[outIndex] = result[j][k];
13
- ++z;
14
- }
15
- }
16
- }
17
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp DELETED
@@ -1,31 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
8
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
9
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
10
-
11
- layout (push_constant) uniform parameter {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb01;
17
- int nb1;
18
- } pcs;
19
-
20
- void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
21
- for (int j = 0; j < k; j++) {
22
- out_[y + j] = inA[x + j];
23
- }
24
- }
25
-
26
- void main() {
27
- const uint i = gl_WorkGroupID.x;
28
- const int r = inB[i + pcs.inBOff];
29
-
30
- dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
31
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp DELETED
@@ -1,31 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout (binding = 0) readonly buffer tensorInA { float inA[]; };
8
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
9
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
10
-
11
- layout (push_constant) uniform parameter {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb01;
17
- int nb1;
18
- } pcs;
19
-
20
- void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
21
- for (int j = 0; j < k; j++) {
22
- out_[y + j] = inA[x + j];
23
- }
24
- }
25
-
26
- void main() {
27
- const uint i = gl_WorkGroupID.x;
28
- const int r = inB[i + pcs.inBOff];
29
-
30
- dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
31
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp DELETED
@@ -1,38 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 2
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q4_0
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q4_0 get_unaligned_block_q4_0(uint index) {
25
- block_q4_0 fres;
26
- fres.d = u8BufToFloat16(inA, index);
27
- [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
28
- fres.qs[it] = inA[index+2+it];
29
- }
30
- return fres;
31
- }
32
-
33
- mat4 dequantize_block(uint index, uint il) {
34
- const block_q4_0 block = get_unaligned_block_q4_0(index);
35
- return dequantize_q4_0(block, il);
36
- }
37
-
38
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp DELETED
@@ -1,39 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 2
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q4_1
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q4_1 get_unaligned_block_q4_1(uint index) {
25
- block_q4_1 fres;
26
- fres.d = u8BufToFloat16(inA, index);
27
- fres.m = u8BufToFloat16(inA, index+2);
28
- [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
29
- fres.qs[it] = inA[index+4+it];
30
- }
31
- return fres;
32
- }
33
-
34
- mat4 dequantize_block(uint index, uint il) {
35
- const block_q4_1 block = get_unaligned_block_q4_1(index);
36
- return dequantize_q4_1(block, il);
37
- }
38
-
39
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp DELETED
@@ -1,44 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 16
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q6_k
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q6_k get_unaligned_block_q6_k(uint index) {
25
- block_q6_k fres;
26
- [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
27
- fres.ql[it] = inA[index + it];
28
- }
29
- [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
30
- fres.qh[it] = inA[index + QK_K/2 + it];
31
- }
32
- [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
33
- fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
34
- }
35
- fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
36
- return fres;
37
- }
38
-
39
- mat4 dequantize_block(uint index, uint il) {
40
- const block_q6_k block = get_unaligned_block_q6_k(index);
41
- return dequantize_q6_k(block, il);
42
- }
43
-
44
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1024) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb00;
17
- int nb01;
18
- int nb02;
19
- int nb03;
20
- int ne10;
21
- int ne11;
22
- int ne12;
23
- int ne13;
24
- int nb10;
25
- int nb11;
26
- int nb12;
27
- int nb13;
28
- int ne0;
29
- int nb0;
30
- int nb1;
31
- int nb2;
32
- int nb3;
33
- } pcs;
34
-
35
- void main() {
36
- const uint i03 = gl_WorkGroupID.z;
37
- const uint i02 = gl_WorkGroupID.y;
38
- const uint i01 = gl_WorkGroupID.x;
39
-
40
- const uint i13 = i03 % pcs.ne13;
41
- const uint i12 = i02 % pcs.ne12;
42
- const uint i11 = i01 % pcs.ne11;
43
-
44
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
45
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
46
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
47
-
48
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
49
- const uint i10 = i0 % pcs.ne10;
50
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp DELETED
@@ -1,69 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
-
7
- layout(local_size_x_id = 0) in;
8
-
9
- layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
10
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
11
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
12
-
13
- layout (push_constant) uniform parameter {
14
- uint inAOff;
15
- uint inBOff;
16
- uint outOff;
17
- int ne00;
18
- int ne01;
19
- int ne02;
20
- uint nb00;
21
- uint nb01;
22
- uint nb02;
23
- uint nb03;
24
- int ne10;
25
- int ne11;
26
- int ne12;
27
- uint nb10;
28
- uint nb11;
29
- uint nb12;
30
- uint nb13;
31
- int ne0;
32
- int ne1;
33
- uint r2;
34
- uint r3;
35
- } pcs;
36
-
37
- #define N_F16_F32 4
38
-
39
- void main() {
40
- const uint r0 = gl_WorkGroupID.x;
41
- const uint rb = gl_WorkGroupID.y*N_F16_F32;
42
- const uint im = gl_WorkGroupID.z;
43
-
44
- const uint i12 = im%pcs.ne12;
45
- const uint i13 = im/pcs.ne12;
46
-
47
- const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
48
-
49
- const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
50
-
51
- for (uint row = 0; row < N_F16_F32; ++row) {
52
- uint r1 = rb + row;
53
- if (r1 >= pcs.ne11) {
54
- break;
55
- }
56
-
57
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
58
-
59
- float sumf = 0;
60
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
61
- sumf += float(inA[x+i]) * float(inB[y+i]);
62
- }
63
-
64
- const float all_sum = subgroupAdd(sumf);
65
- if (subgroupElect()) {
66
- out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
67
- }
68
- }
69
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp DELETED
@@ -1,51 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
- #extension GL_EXT_debug_printf : enable
7
-
8
- // device subgroup size
9
- layout (local_size_x_id = 0) in;
10
-
11
- layout(binding = 0) readonly buffer tensorInA { float inA[]; };
12
- layout(binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout(push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int ne01;
21
- int ne02;
22
- int ne11;
23
- int ne12;
24
- uint nb01;
25
- uint nb02;
26
- uint nb11;
27
- uint nb12;
28
- uint nb1;
29
- uint nb2;
30
- }
31
- pcs;
32
-
33
-
34
- void main() {
35
- uvec3 gid = gl_WorkGroupID;
36
-
37
- uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
38
- uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
39
-
40
- const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
41
- const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
42
- float sum = 0.0f;
43
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
44
- sum += float(inA[x+i]) * float(inB[y+i]);
45
- }
46
-
47
- const float all_sum = subgroupAdd(sum);
48
- if (subgroupElect()) {
49
- out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
50
- }
51
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp DELETED
@@ -1,33 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_0
6
- #define SIZE_OF_BLOCK sizeof_block_q4_0
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_0 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float sumy = 0.0f;
17
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
18
- const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
19
-
20
- const float yl0 = inB[yb + i];
21
- const float yl1 = inB[yb + i + 1];
22
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
23
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
24
-
25
- sumy += yl0 + yl1 + yl8 + yl9;
26
-
27
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
28
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
29
- }
30
- return d * (sumy * -8.f + acc[0] + acc[1]);
31
- }
32
-
33
- #include "op_mul_mv_q_n.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp DELETED
@@ -1,35 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_1
6
- #define SIZE_OF_BLOCK sizeof_block_q4_1
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_1 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float m = float(u8BufToFloat16(inA, index+2));
17
-
18
- float sumy = 0.0f;
19
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
20
- const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
21
-
22
- const float yl0 = inB[yb + i];
23
- const float yl1 = inB[yb + i + 1];
24
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
25
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
26
-
27
- sumy += yl0 + yl1 + yl8 + yl9;
28
-
29
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
30
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
31
- }
32
- return d * (acc[0] + acc[1]) + sumy * m;
33
- }
34
-
35
- #include "op_mul_mv_q_n.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp DELETED
@@ -1,140 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define N_DST 4
6
- #define SIZE_OF_BLOCK sizeof_block_q4_k
7
-
8
- layout(local_size_x = 4) in;
9
- layout(local_size_y = 8) in;
10
- layout(local_size_z = 1) in;
11
-
12
- layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
13
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
14
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
15
-
16
- layout (push_constant) uniform parameter {
17
- uint inAOff;
18
- uint inBOff;
19
- uint outOff;
20
- int ne00;
21
- int ne10;
22
- int ne0;
23
- int ne1;
24
- int ne01;
25
- int ne02;
26
- int ne12;
27
- uint nb01;
28
- uint nb02;
29
- uint nb03;
30
- uint nb11;
31
- uint nb12;
32
- uint nb13;
33
- uint r2;
34
- uint r3;
35
- } pcs;
36
-
37
- void main() {
38
- const uint16_t kmask1 = uint16_t(0x3f3f);
39
- const uint16_t kmask2 = uint16_t(0x0f0f);
40
- const uint16_t kmask3 = uint16_t(0xc0c0);
41
-
42
- const uint ix = gl_SubgroupInvocationID/8; // 0...3
43
- const uint it = gl_SubgroupInvocationID%8; // 0...7
44
- const uint iq = it/4; // 0 or 1
45
- const uint ir = it%4; // 0...3
46
-
47
- const uint nb = pcs.ne00/QK_K;
48
-
49
- const uint r0 = gl_WorkGroupID.x;
50
- const uint r1 = gl_WorkGroupID.y;
51
- const uint im = gl_WorkGroupID.z;
52
-
53
- const uint first_row = r0 * N_DST;
54
- const uint ib_row = first_row * nb;
55
-
56
- const uint i12 = im%pcs.ne12;
57
- const uint i13 = im/pcs.ne12;
58
-
59
- const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
60
- const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
61
-
62
- const uint xblk = offset0 + pcs.inAOff;
63
- const uint y = (offset1 / 4) + pcs.inBOff;
64
-
65
- float yl[16];
66
- float yh[16];
67
- float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
68
- float all_sum = 0.f;
69
-
70
- uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
71
-
72
- for (uint ib = ix; ib < nb; ib += 4) {
73
- const uint blk_idx = ib + xblk;
74
-
75
- float sumy[4] = {0.f, 0.f, 0.f, 0.f};
76
- for (int i = 0; i < 8; ++i) {
77
- yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
78
- yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
79
- yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
80
- yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
81
- }
82
-
83
- for (int row = 0; row < N_DST; row++) {
84
- uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
85
-
86
- uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
87
- uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
88
- uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
89
- uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
90
- uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
91
-
92
- uint16_t sc16[4];
93
- sc16[0] = sc_0 & kmask1;
94
- sc16[1] = sc_2 & kmask1;
95
- sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
96
- sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
97
-
98
- float acc1[4] = {0.f, 0.f, 0.f, 0.f};
99
- float acc2[4] = {0.f, 0.f, 0.f, 0.f};
100
- for (int i = 0; i < 8; i += 2) {
101
- uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
102
- uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
103
- acc1[0] += yl[i+0] * (q1 & 0x000F);
104
- acc1[1] += yl[i+1] * (q1 & 0x0F00);
105
- acc1[2] += yl[i+8] * (q1 & 0x00F0);
106
- acc1[3] += yl[i+9] * (q1 & 0xF000);
107
- acc2[0] += yh[i+0] * (q2 & 0x000F);
108
- acc2[1] += yh[i+1] * (q2 & 0x0F00);
109
- acc2[2] += yh[i+8] * (q2 & 0x00F0);
110
- acc2[3] += yh[i+9] * (q2 & 0xF000);
111
- }
112
-
113
- uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
114
- uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
115
- uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
116
- uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
117
- uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
118
- uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
119
- uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
120
- uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
121
-
122
- float dall = float(inA[blk_idx + row_idx].d);
123
- float dmin = float(inA[blk_idx + row_idx].dmin);
124
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
125
- (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
126
- (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
127
- (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
128
- dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
129
- }
130
-
131
- y4 += 4 * QK_K;
132
- }
133
-
134
- for (int row = 0; row < N_DST; ++row) {
135
- all_sum = subgroupAdd(sumf[row]);
136
- if (subgroupElect()) {
137
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
138
- }
139
- }
140
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp DELETED
@@ -1,106 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define SIZE_OF_BLOCK sizeof_block_q6_k
6
-
7
- layout(local_size_x_id = 0) in;
8
- layout(local_size_y_id = 1) in;
9
- layout(local_size_z = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int ne10;
21
- int ne0;
22
- int ne1;
23
- int ne01;
24
- int ne02;
25
- int ne12;
26
- uint nb01;
27
- uint nb02;
28
- uint nb03;
29
- uint nb11;
30
- uint nb12;
31
- uint nb13;
32
- uint r2;
33
- uint r3;
34
- } pcs;
35
-
36
- void main() {
37
- const uint8_t kmask1 = uint8_t(0x03);
38
- const uint8_t kmask2 = uint8_t(0x0C);
39
- const uint8_t kmask3 = uint8_t(0x30);
40
- const uint8_t kmask4 = uint8_t(0xC0);
41
-
42
- const uint nb = pcs.ne00/QK_K;
43
-
44
- const uint r0 = gl_WorkGroupID.x;
45
- const uint r1 = gl_WorkGroupID.y;
46
- const uint im = gl_WorkGroupID.z;
47
-
48
- const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
49
-
50
- const uint i12 = im%pcs.ne12;
51
- const uint i13 = im/pcs.ne12;
52
-
53
- const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
54
- const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
55
-
56
- float sumf = 0;
57
-
58
- // bits of invocation ID for gl_SubgroupSize=32:
59
- // x x x x x
60
- // 4 3 2 1 0
61
- // ( tid ) ix
62
- // ip ( il )
63
-
64
- const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
65
- const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
66
- const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
67
- const uint ip = tid/8; // first or second half of block (0 or 1)
68
- const uint il = tid%8; // each half has 8 parts, one per scale
69
- const uint n = 4; // 4 scales at a time (and 4 sums)
70
- const uint l0 = n*il; // offset into half-block, 0..28
71
- const uint is = 8*ip + l0/16; // 0, 1, 8, 9
72
-
73
- const uint y_offset = 128*ip + l0;
74
- const uint q_offset_l = 64*ip + l0;
75
- const uint q_offset_h = 32*ip + l0;
76
-
77
- for (uint i = ix; i < nb; i += block_stride) {
78
-
79
- const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
80
-
81
- const uint qlIndex = q_offset_l;
82
- const uint q2Index = qlIndex + QK_K/8;
83
- const uint qhIndex = q_offset_h;
84
- const uint y = yy + i * QK_K + y_offset;
85
-
86
- float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
87
- for (uint l = 0; l < n; ++l) {
88
- const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
89
- const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
90
- const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
91
-
92
- sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
93
- sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
94
- sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
95
- sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
96
- }
97
-
98
- float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
99
- sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
100
- }
101
-
102
- const float tot = subgroupAdd(sumf);
103
- if (subgroupElect()) {
104
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
105
- }
106
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp DELETED
@@ -1,73 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #include "op_mul_mv_q_n_pre.comp"
6
-
7
- #define SIZE_OF_D 2
8
-
9
- #define N_DST 4 // each SIMD group works on 4 rows
10
- #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
11
- #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
12
-
13
- #define NB_Q8_0 8
14
-
15
- void main() {
16
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
17
- if (gl_SubgroupInvocationID > 31)
18
- return;
19
-
20
- const int nr = N_DST;
21
- const int nsg = N_SIMDGROUP;
22
- const int nw = N_SIMDWIDTH;
23
-
24
- const int nb = pcs.ne00/QK8_0;
25
- const uint r0 = gl_WorkGroupID.x;
26
- const uint r1 = gl_WorkGroupID.y;
27
- const uint im = gl_WorkGroupID.z;
28
-
29
- const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
30
-
31
- const uint i12 = im%pcs.ne12;
32
- const uint i13 = im/pcs.ne12;
33
-
34
- const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
35
-
36
- const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
37
- const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
38
-
39
- float yl[NB_Q8_0];
40
- float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
41
-
42
- const uint ix = gl_SubgroupInvocationID.x/4;
43
- const uint il = gl_SubgroupInvocationID.x%4;
44
-
45
- uint yb = y + ix * QK8_0 + NB_Q8_0*il;
46
-
47
- // each thread in a SIMD group deals with NB_Q8_0 quants at a time
48
- for (uint ib = ix; ib < nb; ib += nw/4) {
49
- for (int i = 0; i < NB_Q8_0; ++i) {
50
- yl[i] = inB[yb + i];
51
- }
52
-
53
- for (int row = 0; row < nr; row++) {
54
- const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
55
- float sumq = 0.f;
56
- for (int iq = 0; iq < NB_Q8_0; ++iq) {
57
- const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
58
- sumq += qs_iq * yl[iq];
59
- }
60
- const float16_t d = u8BufToFloat16(inA, x + block_offset);
61
- sumf[row] += sumq*d;
62
- }
63
-
64
- yb += NB_Q8_0 * nw;
65
- }
66
-
67
- for (int row = 0; row < nr; ++row) {
68
- const float tot = subgroupAdd(sumf[row]);
69
- if (subgroupElect() && first_row + row < pcs.ne01) {
70
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
71
- }
72
- }
73
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp DELETED
@@ -1,52 +0,0 @@
1
- void main() {
2
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
3
- if (gl_SubgroupInvocationID > 31)
4
- return;
5
-
6
- const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
7
-
8
- const uint r0 = gl_WorkGroupID.x;
9
- const uint r1 = gl_WorkGroupID.y;
10
- const uint im = gl_WorkGroupID.z;
11
-
12
- const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
13
-
14
- const uint i12 = im%pcs.ne12;
15
- const uint i13 = im/pcs.ne12;
16
-
17
- // pointers to src0 rows
18
- uint ax[N_ROWS];
19
- for (int row = 0; row < N_ROWS; ++row) {
20
- const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
21
-
22
- ax[row] = offset0 + pcs.inAOff;
23
- }
24
-
25
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
26
-
27
- float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
28
-
29
- const uint ix = gl_SubgroupInvocationID/2;
30
- const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
31
-
32
- uint yb = y + ix * BLOCKS_IN_QUANT + il;
33
-
34
- //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
35
- // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
36
- // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
37
-
38
- for (uint ib = ix; ib < nb; ib += 16) {
39
- for (int row = 0; row < N_ROWS; row++) {
40
- sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
41
- }
42
-
43
- yb += BLOCKS_IN_QUANT * 16;
44
- }
45
-
46
- for (int row = 0; row < N_ROWS; ++row) {
47
- const float tot = subgroupAdd(sumf[row]);
48
- if (first_row + row < pcs.ne01 && subgroupElect()) {
49
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp DELETED
@@ -1,28 +0,0 @@
1
- layout(local_size_x_id = 0) in;
2
- layout(local_size_y = 8) in;
3
- layout(local_size_z = 1) in;
4
-
5
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
6
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
7
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
8
-
9
- layout (push_constant) uniform parameter {
10
- uint inAOff;
11
- uint inBOff;
12
- uint outOff;
13
- int ne00;
14
- int ne01;
15
- int ne02;
16
- int ne10;
17
- int ne12;
18
- int ne0;
19
- int ne1;
20
- uint nb01;
21
- uint nb02;
22
- uint nb03;
23
- uint nb11;
24
- uint nb12;
25
- uint nb13;
26
- uint r2;
27
- uint r3;
28
- } pcs;
 
ggml/src/ggml-kompute/kompute-shaders/op_norm.comp DELETED
@@ -1,84 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 256) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint ne00;
14
- uint nb01;
15
- float eps;
16
- } pcs;
17
-
18
- shared float sum[gl_WorkGroupSize.x];
19
-
20
- void main() {
21
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
22
- // MEAN
23
- // parallel sum
24
- sum[gl_LocalInvocationID.x] = 0.0;
25
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
26
- sum[gl_LocalInvocationID.x] += in_[x+i00];
27
- }
28
-
29
- // reduce
30
- barrier();
31
- memoryBarrierShared();
32
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
33
- if (gl_LocalInvocationID.x < i) {
34
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
35
- }
36
- barrier();
37
- memoryBarrierShared();
38
- }
39
-
40
- // broadcast
41
- if (gl_LocalInvocationID.x == 0) {
42
- sum[0] /= float(pcs.ne00);
43
- }
44
- barrier();
45
- memoryBarrierShared();
46
- const float mean = sum[0];
47
-
48
- // recenter
49
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
50
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
51
- out_[y+i00] = in_[x+i00] - mean;
52
- }
53
-
54
- // VARIANCE
55
- // parallel sum
56
- sum[gl_LocalInvocationID.x] = 0.0;
57
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
58
- sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
59
- }
60
-
61
- // reduce
62
- barrier();
63
- memoryBarrierShared();
64
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
65
- if (gl_LocalInvocationID.x < i) {
66
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
67
- }
68
- barrier();
69
- memoryBarrierShared();
70
- }
71
-
72
- // broadcast
73
- if (gl_LocalInvocationID.x == 0) {
74
- sum[0] /= float(pcs.ne00);
75
- }
76
- barrier();
77
- memoryBarrierShared();
78
- const float variance = sum[0];
79
-
80
- const float scale = 1.0f/sqrt(variance + pcs.eps);
81
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
82
- out_[y+i00] *= scale;
83
- }
84
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_relu.comp DELETED
@@ -1,21 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
- layout(push_constant) uniform PushConstants {
10
- uint inOff;
11
- uint outOff;
12
- } pcs;
13
-
14
- void main() {
15
- const uint baseIndex = gl_WorkGroupID.x * 4;
16
-
17
- for (uint x = 0; x < 4; x++) {
18
- const uint i = baseIndex + x;
19
- out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
20
- }
21
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp DELETED
@@ -1,53 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 512) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint ne00;
14
- uint nb01;
15
- float eps;
16
- } pcs;
17
-
18
- shared float sum[gl_WorkGroupSize.x];
19
-
20
- void main() {
21
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
22
-
23
- // parallel sum
24
- sum[gl_LocalInvocationID.x] = 0.0;
25
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
26
- sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
27
- }
28
-
29
- // reduce
30
- barrier();
31
- memoryBarrierShared();
32
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
33
- if (gl_LocalInvocationID.x < i) {
34
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
35
- }
36
- barrier();
37
- memoryBarrierShared();
38
- }
39
-
40
- // broadcast
41
- if (gl_LocalInvocationID.x == 0) {
42
- sum[0] /= float(pcs.ne00);
43
- }
44
- barrier();
45
- memoryBarrierShared();
46
-
47
- const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
48
-
49
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
50
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
51
- out_[y+i00] = in_[x+i00] * scale;
52
- }
53
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "rope_common.comp"
4
-
5
- layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
6
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
- layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
9
-
10
- void main() {
11
- const uint i3 = gl_WorkGroupID.z;
12
- const uint i2 = gl_WorkGroupID.y;
13
- const uint i1 = gl_WorkGroupID.x;
14
-
15
- float corr_dims[2];
16
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
-
18
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
-
20
- float theta_base = float(inB[pcs.inBOff + i2]);
21
- float inv_ndims = -1.f/pcs.n_dims;
22
-
23
- float cos_theta;
24
- float sin_theta;
25
-
26
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
- if (i0 < pcs.n_dims) {
28
- uint ic = i0/2;
29
-
30
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
-
32
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
-
34
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
-
36
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
37
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
38
-
39
- const float x0 = float(inA[src]);
40
- const float x1 = float(inA[src+pcs.n_dims/2]);
41
-
42
- out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
43
- out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
44
- } else {
45
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
46
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
47
-
48
- out_[dst_data] = inA[src];
49
- out_[dst_data+1] = inA[src+1];
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "rope_common.comp"
4
-
5
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
6
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
- layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
9
-
10
- void main() {
11
- const uint i3 = gl_WorkGroupID.z;
12
- const uint i2 = gl_WorkGroupID.y;
13
- const uint i1 = gl_WorkGroupID.x;
14
-
15
- float corr_dims[2];
16
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
-
18
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
-
20
- float theta_base = float(inB[pcs.inBOff + i2]);
21
- float inv_ndims = -1.f/pcs.n_dims;
22
-
23
- float cos_theta;
24
- float sin_theta;
25
-
26
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
- if (i0 < pcs.n_dims) {
28
- uint ic = i0/2;
29
-
30
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
-
32
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
-
34
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
-
36
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
37
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
38
-
39
- const float x0 = inA[src];
40
- const float x1 = inA[src+pcs.n_dims/2];
41
-
42
- out_[dst_data] = x0*cos_theta - x1*sin_theta;
43
- out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
44
- } else {
45
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
46
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
47
-
48
- out_[dst_data] = inA[src];
49
- out_[dst_data+1] = inA[src+1];
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp DELETED
@@ -1,52 +0,0 @@
- #version 450
-
- #include "rope_common.comp"
-
- layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
- layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
- void main() {
-     const uint i3 = gl_WorkGroupID.z;
-     const uint i2 = gl_WorkGroupID.y;
-     const uint i1 = gl_WorkGroupID.x;
-
-     float corr_dims[2];
-     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-     float theta_base = float(inB[pcs.inBOff + i2]);
-     float inv_ndims = -1.f/pcs.n_dims;
-
-     float cos_theta;
-     float sin_theta;
-
-     for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
-         if (i0 < pcs.n_dims) {
-             uint ic = i0/2;
-
-             float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
-             const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
-             rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
-             const float x0 = float(inA[src]);
-             const float x1 = float(inA[src+1]);
-
-             out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
-             out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-         } else {
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
-             out_[dst_data] = inA[src];
-             out_[dst_data+1] = inA[src+1];
-         }
-     }
- }
ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp DELETED
@@ -1,52 +0,0 @@
- #version 450
-
- #include "rope_common.comp"
-
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
- layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
-
- void main() {
-     const uint i3 = gl_WorkGroupID.z;
-     const uint i2 = gl_WorkGroupID.y;
-     const uint i1 = gl_WorkGroupID.x;
-
-     float corr_dims[2];
-     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-     float theta_base = float(inB[pcs.inBOff + i2]);
-     float inv_ndims = -1.f/pcs.n_dims;
-
-     float cos_theta;
-     float sin_theta;
-
-     for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
-         if (i0 < pcs.n_dims) {
-             uint ic = i0/2;
-
-             float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
-             const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
-             rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-             const float x0 = inA[src];
-             const float x1 = inA[src+1];
-
-             out_[dst_data] = x0*cos_theta - x1*sin_theta;
-             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-         } else {
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-             out_[dst_data] = inA[src];
-             out_[dst_data+1] = inA[src+1];
-         }
-     }
- }
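The four deleted rope shaders above differ only in element type (f16 vs f32, which is why the byte offsets are divided by 2 or 4) and in how elements are paired for rotation: the `norm` variants rotate adjacent elements `(src, src+1)`, while the `neox` variants rotate elements `n_dims/2` apart; dimensions at or beyond `n_dims` are copied through unchanged in the `else` branch. A minimal scalar C sketch of the two pairings (illustrative names only, not part of the ggml API):

```c
// Sketch of the two RoPE pairing schemes used by the deleted shaders above.
#include <math.h>
#include <stdio.h>

// "norm" variant: rotate adjacent elements (x[0], x[1]).
static void rope_norm_pair(const float *x, float *y, float cos_theta, float sin_theta) {
    y[0] = x[0]*cos_theta - x[1]*sin_theta;
    y[1] = x[0]*sin_theta + x[1]*cos_theta;
}

// "neox" variant: rotate elements half the rotated width apart (x[0], x[half]),
// where half = n_dims/2.
static void rope_neox_pair(const float *x, float *y, int half, float cos_theta, float sin_theta) {
    y[0]    = x[0]*cos_theta - x[half]*sin_theta;
    y[half] = x[0]*sin_theta + x[half]*cos_theta;
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {0};
    float c = cosf(0.5f), s = sinf(0.5f);

    rope_norm_pair(x, y, c, s);     // pairs (x[0], x[1])
    printf("norm: %f %f\n", y[0], y[1]);

    rope_neox_pair(x, y, 2, c, s);  // pairs (x[0], x[2]) with n_dims = 4
    printf("neox: %f %f\n", y[0], y[2]);
    return 0;
}
```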
ggml/src/ggml-kompute/kompute-shaders/op_scale.comp DELETED
@@ -1,19 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
-     float scale;
- } pcs;
-
- void main() {
-     const uint i = gl_WorkGroupID.x;
-     out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
- }
ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp DELETED
@@ -1,23 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
-     float scale;
- } pcs;
-
- void main() {
-     const uint baseIndex = gl_WorkGroupID.x * 8;
-
-     for (uint x = 0; x < 8; x++) {
-         const uint i = baseIndex + x;
-         out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-     }
- }
ggml/src/ggml-kompute/kompute-shaders/op_silu.comp DELETED
@@ -1,22 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
- } pcs;
-
- void main() {
-     const uint baseIndex = gl_WorkGroupID.x * 4;
-
-     for (uint x = 0; x < 4; x++) {
-         const uint i = baseIndex + x;
-         const float y = in_[i + pcs.inOff];
-         out_[i + pcs.outOff] = y / (1.0 + exp(-y));
-     }
- }
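For reference, the loop body above is the SiLU activation, `silu(x) = x * sigmoid(x) = x / (1 + exp(-x))`, applied to four consecutive elements per workgroup. A scalar C sketch of the same function (illustrative only, not the ggml API):

```c
// SiLU as computed by the deleted op_silu.comp loop body.
#include <math.h>
#include <stdio.h>

static float silu(float x) {
    return x / (1.0f + expf(-x));
}

int main(void) {
    for (float x = -2.0f; x <= 2.0f; x += 1.0f) {
        printf("silu(%.1f) = %f\n", x, silu(x));
    }
    return 0;
}
```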
ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp DELETED
@@ -1,72 +0,0 @@
- // TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
-
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x_id = 0) in;
-
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inAOff;
-     uint inBOff;
-     uint outOff;
-     int ne00;
-     int ne01;
-     int ne02;
-     float scale;
-     float max_bias;
-     float m0;
-     float m1;
-     uint n_head_log2;
-     int mask;
- } pcs;
-
- void main() {
-     if (gl_SubgroupInvocationID > 31)
-         return;
-
-     const uint i03 = gl_WorkGroupID.z;
-     const uint i02 = gl_WorkGroupID.y;
-     const uint i01 = gl_WorkGroupID.x;
-
-     const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
-     const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
-     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
-     const uint pdst = extra_off + pcs.outOff; // Based from out_
-
-     float slope = 1.0f;
-
-     // ALiBi
-     if (pcs.max_bias > 0.0f) {
-         int64_t h = i02;
-
-         float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
-         int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
-
-         slope = pow(base, float(exp));
-     }
-
-     // parallel max
-     float localMax = uintBitsToFloat(0xFF800000);
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
-     }
-     float max_ = subgroupMax(localMax);
-
-     // parallel sum
-     float localSum = 0.0f;
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
-         localSum += exp_psrc0;
-         out_[pdst + i00] = exp_psrc0;
-     }
-
-     const float sum = subgroupAdd(localSum);
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         out_[pdst + i00] /= sum;
-     }
- }
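The shader above computes a numerically stable softmax over each row: it optionally adds an ALiBi bias (`slope * mask`), takes the row maximum, exponentiates the shifted values, and normalizes by their sum; the three loops are strided across a 32-lane subgroup and reduced with `subgroupMax`/`subgroupAdd`. The slope itself is `pow(base, exp)` with `base` and `exp` picked from `m0`, `m1`, and `n_head_log2` as in the `// ALiBi` block. A scalar C sketch of the same per-row math (illustrative names, not the ggml API):

```c
// Per-row softmax with optional ALiBi-style mask bias, mirroring the math of
// the deleted op_softmax.comp (without the subgroup parallelism).
#include <math.h>
#include <stddef.h>

void softmax_row(const float *src, const float *mask, float *dst, size_t n,
                 float scale, float slope) {
    // 1. row maximum of the scaled, biased logits (for numerical stability)
    float max_ = -INFINITY;
    for (size_t i = 0; i < n; i++) {
        const float v = src[i]*scale + (mask ? slope*mask[i] : 0.0f);
        if (v > max_) max_ = v;
    }
    // 2. exponentiate shifted values and accumulate their sum
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        const float e = expf(src[i]*scale + (mask ? slope*mask[i] : 0.0f) - max_);
        dst[i] = e;
        sum += e;
    }
    // 3. normalize
    for (size_t i = 0; i < n; i++) {
        dst[i] /= sum;
    }
}
```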
ggml/src/ggml-kompute/kompute-shaders/rope_common.comp DELETED
@@ -1,71 +0,0 @@
- #include "common.comp"
-
- #define GGML_ROPE_TYPE_NEOX 2
-
- // TODO: use a local size of 32 or more (Metal uses 1024)
- layout(local_size_x = 1) in;
-
- layout (push_constant) uniform parameter {
-     uint inAOff;
-     uint inBOff;
-     uint inCOff;
-     uint outOff;
-     int n_dims;
-     int mode;
-     int n_ctx_orig;
-     float freq_base;
-     float freq_scale;
-     bool has_freq_factors;
-     float ext_factor;
-     float attn_factor;
-     float beta_fast;
-     float beta_slow;
-     uint nb00;
-     uint nb01;
-     uint nb02;
-     uint nb03;
-     int ne0;
-     uint nb0;
-     uint nb1;
-     uint nb2;
-     uint nb3;
- } pcs;
-
- float rope_yarn_ramp(const float low, const float high, const float i0) {
-     const float y = (i0 / 2 - low) / max(0.001f, high - low);
-     return 1.0f - min(1.0f, max(0.0f, y));
- }
-
- // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
- // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
- void rope_yarn(
-     float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
-     out float cos_theta, out float sin_theta
- ) {
-     // Get n-d rotational scaling corrected for extrapolation
-     float theta_interp = freq_scale * theta_extrap;
-     float theta = theta_interp;
-     if (ext_factor != 0.0f) {
-         float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-         theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-         // Get n-d magnitude scaling corrected for interpolation
-         mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
-     }
-     cos_theta = cos(theta) * mscale;
-     sin_theta = sin(theta) * mscale;
- }
-
- // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
- // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
- float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
-     return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
- }
-
- void rope_yarn_corr_dims(
-     int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
- ) {
-     // start and end correction dims
-     dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
-     dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
- }
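For reference, the correction factor in `rope_yarn_corr_factor` follows from asking at which dimension index the rotary embedding completes a given number of full rotations over the original context. A short derivation consistent with the code above, writing base b = `freq_base`, d = `n_dims`, L = `n_ctx_orig`:

```latex
% Dimension i rotates with frequency b^{-2i/d}, so over L positions it completes
% r(i) = L * b^{-2i/d} / (2*pi) full rotations. Solving r(i) = n_rot for i:
\[
  i \;=\; \frac{d}{2\ln b}\,\ln\!\left(\frac{L}{2\pi\, n_{\mathrm{rot}}}\right)
  \;=\; \mathtt{rope\_yarn\_corr\_factor}(d,\, L,\, n_{\mathrm{rot}},\, b)
\]
```

Evaluating this at `beta_fast` and `beta_slow` and clamping to `[0, n_dims - 1]` gives the ramp endpoints returned by `rope_yarn_corr_dims`, which `rope_yarn` then uses to blend the interpolated and extrapolated angles whenever `ext_factor != 0`.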