ggerganov committed
Commit cbbfa9e (unverified)
1 parent: 15949a9

sync : ggml (#2001)


* sync : update scripts

* sync : ggml

* talk-llama : sync llama.cpp

* make : WHISPER_CUBLAS -> WHISPER_CUDA

* ci : try to fix sycl build

* talk-llama : fix make build

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
.github/workflows/build.yml CHANGED
@@ -152,13 +152,13 @@ jobs:
152
 
153
  ubuntu-22-cmake-sycl:
154
  runs-on: ubuntu-22.04
155
-
156
  strategy:
157
  fail-fast: false
158
  matrix:
159
  dwhisper_sycl: [ON]
160
  dcmake_c_compiler: [icx]
161
- dcmake_cxx_compiler: [icpx]
162
  arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
163
 
164
  continue-on-error: true
@@ -166,7 +166,7 @@ jobs:
166
  steps:
167
  - name: Clone
168
  uses: actions/checkout@v3
169
-
170
  - name: add oneAPI to apt
171
  shell: bash
172
  run: |
@@ -190,7 +190,7 @@ jobs:
190
  - name: Clone
191
  id: checkout
192
  uses: actions/checkout@v3
193
-
194
  - name: Build
195
  id: cmake_build
196
  run: |
@@ -202,13 +202,13 @@ jobs:
202
 
203
  ubuntu-22-cmake-sycl-fp16:
204
  runs-on: ubuntu-22.04
205
-
206
  strategy:
207
  fail-fast: false
208
  matrix:
209
  dwhisper_sycl: [ON]
210
  dcmake_c_compiler: [icx]
211
- dcmake_cxx_compiler: [icpx]
212
  arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
213
 
214
  continue-on-error: true
@@ -216,7 +216,7 @@ jobs:
216
  steps:
217
  - name: Clone
218
  uses: actions/checkout@v3
219
-
220
  - name: add oneAPI to apt
221
  shell: bash
222
  run: |
@@ -240,7 +240,7 @@ jobs:
240
  - name: Clone
241
  id: checkout
242
  uses: actions/checkout@v3
243
-
244
  - name: Build
245
  id: cmake_build
246
  run: |
@@ -249,7 +249,7 @@ jobs:
249
  cd build
250
  cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
251
  cmake --build . --config Release -j $(nproc)
252
-
253
  windows:
254
  runs-on: windows-latest
255
 
 
152
 
153
  ubuntu-22-cmake-sycl:
154
  runs-on: ubuntu-22.04
155
+
156
  strategy:
157
  fail-fast: false
158
  matrix:
159
  dwhisper_sycl: [ON]
160
  dcmake_c_compiler: [icx]
161
+ dcmake_cxx_compiler: [icpx]
162
  arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
163
 
164
  continue-on-error: true
 
166
  steps:
167
  - name: Clone
168
  uses: actions/checkout@v3
169
+
170
  - name: add oneAPI to apt
171
  shell: bash
172
  run: |
 
190
  - name: Clone
191
  id: checkout
192
  uses: actions/checkout@v3
193
+
194
  - name: Build
195
  id: cmake_build
196
  run: |
 
202
 
203
  ubuntu-22-cmake-sycl-fp16:
204
  runs-on: ubuntu-22.04
205
+
206
  strategy:
207
  fail-fast: false
208
  matrix:
209
  dwhisper_sycl: [ON]
210
  dcmake_c_compiler: [icx]
211
+ dcmake_cxx_compiler: [icpx]
212
  arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
213
 
214
  continue-on-error: true
 
216
  steps:
217
  - name: Clone
218
  uses: actions/checkout@v3
219
+
220
  - name: add oneAPI to apt
221
  shell: bash
222
  run: |
 
240
  - name: Clone
241
  id: checkout
242
  uses: actions/checkout@v3
243
+
244
  - name: Build
245
  id: cmake_build
246
  run: |
 
249
  cd build
250
  cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
251
  cmake --build . --config Release -j $(nproc)
252
+
253
  windows:
254
  runs-on: windows-latest
255
 
CMakeLists.txt CHANGED
@@ -74,7 +74,8 @@ else()
74
  option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
75
  option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
76
  option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
77
- option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
 
78
  option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
79
  option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
80
  option(WHISPER_SYCL "whisper: use SYCL" OFF)
@@ -240,6 +241,11 @@ if (WHISPER_BLAS)
240
  endif ()
241
 
242
  if (WHISPER_CUBLAS)
 
 
 
 
 
243
  cmake_minimum_required(VERSION 3.17)
244
 
245
  find_package(CUDAToolkit)
@@ -249,9 +255,11 @@ if (WHISPER_CUBLAS)
249
 
250
  enable_language(CUDA)
251
 
252
- set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
 
 
253
 
254
- add_compile_definitions(GGML_USE_CUBLAS)
255
 
256
  if (WHISPER_STATIC)
257
  if (WIN32)
@@ -286,7 +294,7 @@ if (WHISPER_HIPBLAS)
286
 
287
  if (${hipblas_FOUND} AND ${hip_FOUND})
288
  message(STATUS "HIP and hipBLAS found")
289
- add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
290
  add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
291
  set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
292
  set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
 
74
  option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
75
  option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
76
  option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
77
+ option(WHISPER_CUDA "whisper: support for CUDA" OFF)
78
+ option(WHISPER_CUBLAS "whisper: support for CUDA (deprecated)" OFF)
79
  option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
80
  option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
81
  option(WHISPER_SYCL "whisper: use SYCL" OFF)
 
241
  endif ()
242
 
243
  if (WHISPER_CUBLAS)
244
+ message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
245
+ set(WHISPER_CUDA ON)
246
+ endif()
247
+
248
+ if (WHISPER_CUDA)
249
  cmake_minimum_required(VERSION 3.17)
250
 
251
  find_package(CUDAToolkit)
 
255
 
256
  enable_language(CUDA)
257
 
258
+ file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
259
+ list(APPEND GGML_SOURCES_CUDA ggml-cuda.h)
260
+ list(APPEND GGML_SOURCES_CUDA ggml-cuda.cu)
261
 
262
+ add_compile_definitions(GGML_USE_CUDA)
263
 
264
  if (WHISPER_STATIC)
265
  if (WIN32)
 
294
 
295
  if (${hipblas_FOUND} AND ${hip_FOUND})
296
  message(STATUS "HIP and hipBLAS found")
297
+ add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
298
  add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
299
  set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
300
  set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
Makefile CHANGED
@@ -216,20 +216,29 @@ ifdef WHISPER_OPENBLAS
216
  endif
217
 
218
  ifdef WHISPER_CUBLAS
 
 
 
 
 
219
  ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
220
  CUDA_ARCH_FLAG ?= native
221
  else
222
  CUDA_ARCH_FLAG ?= all
223
  endif
224
 
225
- CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
226
- CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
227
  LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
228
  WHISPER_OBJ += ggml-cuda.o
 
229
  NVCC = nvcc
230
  NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
231
 
232
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 
 
 
233
  $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
234
  endif
235
 
@@ -237,14 +246,18 @@ ifdef WHISPER_HIPBLAS
237
  ROCM_PATH ?= /opt/rocm
238
  HIPCC ?= $(ROCM_PATH)/bin/hipcc
239
  GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
240
- CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
241
- CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
242
  LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
243
  LDFLAGS += -lhipblas -lamdhip64 -lrocblas
244
  HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
245
  WHISPER_OBJ += ggml-cuda.o
 
 
 
 
246
 
247
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
248
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
249
  endif
250
 
@@ -309,6 +322,13 @@ $(info I CC: $(CCV))
309
  $(info I CXX: $(CXXV))
310
  $(info )
311
 
 
 
 
 
 
 
 
312
  #
313
  # Build library
314
  #
@@ -410,8 +430,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
410
  talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
411
  $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
412
 
413
- talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
414
- $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
415
 
416
  #
417
  # Audio samples
 
216
  endif
217
 
218
  ifdef WHISPER_CUBLAS
219
+ # WHISPER_CUBLAS is deprecated and will be removed in the future
220
+ WHISPER_CUDA := 1
221
+ endif
222
+
223
+ ifdef WHISPER_CUDA
224
  ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
225
  CUDA_ARCH_FLAG ?= native
226
  else
227
  CUDA_ARCH_FLAG ?= all
228
  endif
229
 
230
+ CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
231
+ CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
232
  LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
233
  WHISPER_OBJ += ggml-cuda.o
234
+ WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
235
  NVCC = nvcc
236
  NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
237
 
238
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
239
+ $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
240
+
241
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
242
  $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
243
  endif
244
 
 
246
  ROCM_PATH ?= /opt/rocm
247
  HIPCC ?= $(ROCM_PATH)/bin/hipcc
248
  GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
249
+ CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
250
+ CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
251
  LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
252
  LDFLAGS += -lhipblas -lamdhip64 -lrocblas
253
  HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
254
  WHISPER_OBJ += ggml-cuda.o
255
+ WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
256
+
257
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
258
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
259
 
260
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
261
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
262
  endif
263
 
 
322
  $(info I CXX: $(CXXV))
323
  $(info )
324
 
325
+ ifdef WHISPER_CUBLAS
326
+ $(info !!!!)
327
+ $(info WHISPER_CUBLAS is deprecated and will be removed in the future. Use WHISPER_CUDA instead.)
328
+ $(info !!!!)
329
+ $(info )
330
+ endif
331
+
332
  #
333
  # Build library
334
  #
 
430
  talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
431
  $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
432
 
433
+ talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
434
+ $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
435
 
436
  #
437
  # Audio samples
README.md CHANGED
@@ -414,11 +414,11 @@ For more information about the Core ML implementation please refer to PR [#1037]
414
  With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
415
  First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
416
 
417
- Now build `whisper.cpp` with cuBLAS support:
418
 
419
  ```
420
  make clean
421
- WHISPER_CUBLAS=1 make -j
422
  ```
423
 
424
  ## OpenCL GPU support via CLBlast
 
414
  With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
415
  First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
416
 
417
+ Now build `whisper.cpp` with CUDA support:
418
 
419
  ```
420
  make clean
421
+ WHISPER_CUDA=1 make -j
422
  ```
423
 
424
  ## OpenCL GPU support via CLBlast
examples/common-ggml.cpp CHANGED
@@ -70,6 +70,7 @@ bool ggml_common_quantize_0(
70
  case GGML_FTYPE_MOSTLY_IQ1_S:
71
  case GGML_FTYPE_MOSTLY_IQ4_NL:
72
  case GGML_FTYPE_MOSTLY_IQ4_XS:
 
73
  {
74
  fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
75
  return false;
@@ -193,6 +194,8 @@ bool ggml_common_quantize_0(
193
  case GGML_TYPE_I8:
194
  case GGML_TYPE_I16:
195
  case GGML_TYPE_I32:
 
 
196
  case GGML_TYPE_Q8_1:
197
  case GGML_TYPE_Q8_K:
198
  case GGML_TYPE_IQ2_XXS:
@@ -203,6 +206,7 @@ bool ggml_common_quantize_0(
203
  case GGML_TYPE_IQ1_S:
204
  case GGML_TYPE_IQ4_NL:
205
  case GGML_TYPE_IQ4_XS:
 
206
  case GGML_TYPE_COUNT:
207
  {
208
  fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
 
70
  case GGML_FTYPE_MOSTLY_IQ1_S:
71
  case GGML_FTYPE_MOSTLY_IQ4_NL:
72
  case GGML_FTYPE_MOSTLY_IQ4_XS:
73
+ case GGML_FTYPE_MOSTLY_IQ1_M:
74
  {
75
  fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
76
  return false;
 
194
  case GGML_TYPE_I8:
195
  case GGML_TYPE_I16:
196
  case GGML_TYPE_I32:
197
+ case GGML_TYPE_I64:
198
+ case GGML_TYPE_F64:
199
  case GGML_TYPE_Q8_1:
200
  case GGML_TYPE_Q8_K:
201
  case GGML_TYPE_IQ2_XXS:
 
206
  case GGML_TYPE_IQ1_S:
207
  case GGML_TYPE_IQ4_NL:
208
  case GGML_TYPE_IQ4_XS:
209
+ case GGML_TYPE_IQ1_M:
210
  case GGML_TYPE_COUNT:
211
  {
212
  fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
examples/talk-llama/CMakeLists.txt CHANGED
@@ -1,7 +1,7 @@
1
  if (WHISPER_SDL2)
2
  # talk-llama
3
  set(TARGET talk-llama)
4
- add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp)
5
  target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
6
 
7
  if (WHISPER_CLBLAST)
 
1
  if (WHISPER_SDL2)
2
  # talk-llama
3
  set(TARGET talk-llama)
4
+ add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
5
  target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
6
 
7
  if (WHISPER_CLBLAST)
examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -39,7 +39,7 @@
39
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
40
 
41
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
42
- #define LLAMA_SESSION_VERSION 4
43
 
44
  #ifdef __cplusplus
45
  extern "C" {
@@ -117,6 +117,7 @@ extern "C" {
117
  LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
118
  LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
119
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 
120
 
121
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
122
  };
@@ -275,13 +276,16 @@ extern "C" {
275
 
276
  // model quantization parameters
277
  typedef struct llama_model_quantize_params {
278
- int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
279
- enum llama_ftype ftype; // quantize to this llama_ftype
280
- bool allow_requantize; // allow quantizing non-f32/f16 tensors
281
- bool quantize_output_tensor; // quantize output.weight
282
- bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
283
- bool pure; // quantize all tensors to the default type
284
- void * imatrix; // pointer to importance matrix data
 
 
 
285
  } llama_model_quantize_params;
286
 
287
  // grammar types
@@ -388,6 +392,7 @@ extern "C" {
388
  LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
389
  LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
390
  LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
 
391
 
392
  // Get the model's RoPE frequency scaling factor
393
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +440,24 @@ extern "C" {
435
  // Returns 0 on success
436
  LLAMA_API int32_t llama_model_apply_lora_from_file(
437
  const struct llama_model * model,
438
- const char * path_lora,
439
- float scale,
440
- const char * path_base_model,
441
- int32_t n_threads);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
 
443
  //
444
  // KV cache
@@ -659,23 +678,29 @@ extern "C" {
659
  LLAMA_API void llama_synchronize(struct llama_context * ctx);
660
 
661
  // Token logits obtained from the last call to llama_decode()
662
- // The logits for the last token are stored in the last row
663
- // Logits for which llama_batch.logits[i] == 0 are undefined
664
- // Rows: n_tokens provided with llama_batch
665
  // Cols: n_vocab
666
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
667
 
668
  // Logits for the ith token. Equivalent to:
669
- // llama_get_logits(ctx) + i*n_vocab
 
670
  LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
671
 
672
- // Get all output token embeddings
673
- // shape: [n_tokens*n_embd] (1-dimensional)
 
 
 
 
674
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
675
 
676
- // Get the embeddings for the ith token
677
- // llama_get_embeddings(ctx) + i*n_embd
678
  // shape: [n_embd] (1-dimensional)
 
679
  LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
680
 
681
  // Get the embeddings for a sequence id
@@ -945,6 +970,16 @@ extern "C" {
945
  int32_t n_past,
946
  int32_t n_predict);
947
 
 
 
 
 
 
 
 
 
 
 
948
  // Performance information
949
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
950
 
 
39
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
40
 
41
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
42
+ #define LLAMA_SESSION_VERSION 5
43
 
44
  #ifdef __cplusplus
45
  extern "C" {
 
117
  LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
118
  LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
119
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
120
+ LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
121
 
122
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
123
  };
 
276
 
277
  // model quantization parameters
278
  typedef struct llama_model_quantize_params {
279
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
280
+ enum llama_ftype ftype; // quantize to this llama_ftype
281
+ enum ggml_type output_tensor_type; // output tensor type
282
+ enum ggml_type token_embedding_type; // itoken embeddings tensor type
283
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
284
+ bool quantize_output_tensor; // quantize output.weight
285
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
286
+ bool pure; // quantize all tensors to the default type
287
+ void * imatrix; // pointer to importance matrix data
288
+ void * kv_overrides; // pointer to vector containing overrides
289
  } llama_model_quantize_params;
290
 
291
  // grammar types
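Note: the quantization parameters gain per-tensor type overrides (output_tensor_type, token_embedding_type) and a kv_overrides pointer in this sync. Below is a minimal sketch of filling the extended struct, assuming the existing llama_model_quantize_default_params()/llama_model_quantize() entry points from llama.h; the GGML_TYPE_* choices are purely illustrative.

```
#include "llama.h"

// sketch: quantize a model using the new per-tensor override fields
static uint32_t quantize_with_overrides(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype                = LLAMA_FTYPE_MOSTLY_IQ4_XS;
    qparams.output_tensor_type   = GGML_TYPE_Q6_K; // illustrative override for the output tensor
    qparams.token_embedding_type = GGML_TYPE_Q4_K; // illustrative override for the token embeddings
    qparams.kv_overrides         = nullptr;        // optional pointer to a vector of KV overrides

    return llama_model_quantize(fname_inp, fname_out, &qparams); // 0 on success
}
```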
 
392
  LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
393
  LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
394
  LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
395
+ LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
396
 
397
  // Get the model's RoPE frequency scaling factor
398
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
440
  // Returns 0 on success
441
  LLAMA_API int32_t llama_model_apply_lora_from_file(
442
  const struct llama_model * model,
443
+ const char * path_lora,
444
+ float scale,
445
+ const char * path_base_model,
446
+ int32_t n_threads);
447
+
448
+ // Apply a loaded control vector to a llama_context, or if data is NULL, clear
449
+ // the currently loaded vector.
450
+ // n_embd should be the size of a single layer's control, and data should point
451
+ // to an n_embd x n_layers buffer starting from layer 1.
452
+ // il_start and il_end are the layer range the vector should apply to (both inclusive)
453
+ // See llama_control_vector_load in common to load a control vector.
454
+ LLAMA_API int32_t llama_control_vector_apply(
455
+ struct llama_context * lctx,
456
+ const float * data,
457
+ size_t len,
458
+ int32_t n_embd,
459
+ int32_t il_start,
460
+ int32_t il_end);
461
 
462
  //
463
  // KV cache
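Note: the control-vector API is only declared here; llama_control_vector_load in common (referenced in the comment) produces the data. A hedged usage sketch follows, assuming cvec already holds n_embd x n_layers floats laid out as described above, starting from layer 1.

```
#include "llama.h"

#include <cstdio>
#include <vector>

// sketch: apply a control vector to every layer, then clear it again
static void apply_and_clear(llama_context * ctx, const llama_model * model, const std::vector<float> & cvec) {
    const int32_t n_embd  = llama_n_embd (model);
    const int32_t n_layer = llama_n_layer(model); // accessor added in this same commit

    // both layer bounds are inclusive
    if (llama_control_vector_apply(ctx, cvec.data(), cvec.size(), n_embd, 1, n_layer) != 0) {
        fprintf(stderr, "failed to apply control vector\n");
    }

    // passing NULL for the data clears the currently loaded vector
    llama_control_vector_apply(ctx, nullptr, 0, n_embd, 1, n_layer);
}
```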
 
678
  LLAMA_API void llama_synchronize(struct llama_context * ctx);
679
 
680
  // Token logits obtained from the last call to llama_decode()
681
+ // The logits for which llama_batch.logits[i] != 0 are stored contiguously
682
+ // in the order they have appeared in the batch.
683
+ // Rows: number of tokens for which llama_batch.logits[i] != 0
684
  // Cols: n_vocab
685
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
686
 
687
  // Logits for the ith token. Equivalent to:
688
+ // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
689
+ // returns NULL for invalid ids.
690
  LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
691
 
692
+ // Get all output token embeddings.
693
+ // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
694
+ // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
695
+ // in the order they have appeared in the batch.
696
+ // shape: [n_outputs*n_embd]
697
+ // Otherwise, returns NULL.
698
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
699
 
700
+ // Get the embeddings for the ith token. Equivalent to:
701
+ // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
702
  // shape: [n_embd] (1-dimensional)
703
+ // returns NULL for invalid ids.
704
  LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
705
 
706
  // Get the embeddings for a sequence id
 
970
  int32_t n_past,
971
  int32_t n_predict);
972
 
973
+ /// @details Build a split GGUF final path for this chunk.
974
+ /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
975
+ // Returns the split_path length.
976
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
977
+
978
+ /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
979
+ /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
980
+ // Returns the split_prefix length.
981
+ LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
982
+
983
  // Performance information
984
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
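Note: the split-GGUF helpers above come with worked examples in their comments; the sketch below simply reproduces them (the model path is hypothetical, buffer sizes arbitrary).

```
#include "llama.h"

#include <cstdio>

int main() {
    char split_path[512];
    // per the comment above: => "/models/ggml-model-q4_0-00002-of-00004.gguf"
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    printf("%s\n", split_path);

    char split_prefix[512];
    // inverse direction: recovers the prefix only if split_no/split_count match
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("%s\n", split_prefix); // "/models/ggml-model-q4_0"
    }
    return 0;
}
```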
985
 
examples/talk-llama/unicode-data.cpp ADDED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/unicode-data.h ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <map>
5
+ #include <utility>
6
+ #include <vector>
7
+
8
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
9
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
10
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
11
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
12
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
13
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
14
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
15
+ extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
16
+ extern const std::map<char32_t, char32_t> unicode_map_lowercase;
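Note: these tables feed the new tokenizer code in unicode.cpp; below is a hedged sketch of querying one of the range tables, assuming each pair encodes an inclusive [first, last] codepoint range (which is how unicode.cpp appears to consume them).

```
#include "unicode-data.h"

// sketch: linear scan of a range table; assumes inclusive [first, last] ranges
static bool cpt_in_ranges(const std::vector<std::pair<uint32_t, uint32_t>> & ranges, uint32_t cpt) {
    for (const auto & range : ranges) {
        if (cpt >= range.first && cpt <= range.second) {
            return true;
        }
    }
    return false;
}

// e.g. cpt_in_ranges(unicode_ranges_whitespace, 0x0020) should be true for an ASCII space
```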
examples/talk-llama/unicode.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/unicode.h CHANGED
@@ -24,3 +24,5 @@ int unicode_cpt_type(const std::string & utf8);
24
  std::string unicode_byte_to_utf8(uint8_t byte);
25
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
26
 
 
 
 
24
  std::string unicode_byte_to_utf8(uint8_t byte);
25
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
26
 
27
+ // simple tolower that only implements one-to-one mapping, not one-to-many
28
+ char32_t unicode_tolower(char32_t cp);
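Note: per the comment, only one-to-one mappings (from unicode_map_lowercase) are applied; a codepoint without such a mapping is presumably returned unchanged. Trivial usage sketch:

```
#include "unicode.h"

// sketch: simple one-to-one lowercasing of a single codepoint
const char32_t lower = unicode_tolower(U'A'); // 0x0041 -> 0x0061 ('a')
```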
examples/wchess/CMakeLists.txt CHANGED
@@ -1,5 +1,3 @@
1
- set(CMAKE_CXX_STANDARD 11)
2
-
3
  add_subdirectory(libwchess)
4
 
5
  if (EMSCRIPTEN)
 
 
 
1
  add_subdirectory(libwchess)
2
 
3
  if (EMSCRIPTEN)
extra/sync-ggml-am.sh CHANGED
@@ -98,6 +98,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
98
  # src/ggml-backend-impl.h -> ggml-backend-impl.h
99
  # src/ggml-backend.c -> ggml-backend.c
100
  # src/ggml-common.h -> ggml-common.h
 
101
  # src/ggml-cuda.cu -> ggml-cuda.cu
102
  # src/ggml-cuda.h -> ggml-cuda.h
103
  # src/ggml-impl.h -> ggml-impl.h
@@ -135,6 +136,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
135
  -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
136
  -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
137
  -e 's/src\/ggml-common\.h/ggml-common.h/g' \
 
138
  -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
139
  -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
140
  -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
 
98
  # src/ggml-backend-impl.h -> ggml-backend-impl.h
99
  # src/ggml-backend.c -> ggml-backend.c
100
  # src/ggml-common.h -> ggml-common.h
101
+ # src/ggml-cuda/* -> ggml-cuda/
102
  # src/ggml-cuda.cu -> ggml-cuda.cu
103
  # src/ggml-cuda.h -> ggml-cuda.h
104
  # src/ggml-impl.h -> ggml-impl.h
 
136
  -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
137
  -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
138
  -e 's/src\/ggml-common\.h/ggml-common.h/g' \
139
+ -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
140
  -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
141
  -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
142
  -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
extra/sync-ggml.sh CHANGED
@@ -6,6 +6,7 @@ cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
6
  cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
7
  cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
8
  cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h
 
9
  cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
10
  cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
11
  cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp
 
6
  cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
7
  cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
8
  cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h
9
+ cp -rpv ../ggml/src/ggml-cuda/* ./ggml-cuda/
10
  cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
11
  cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
12
  cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp
extra/sync-llama.sh CHANGED
@@ -1,6 +1,8 @@
1
  #!/bin/bash
2
 
3
- cp -rpv ../llama.cpp/llama.h ./examples/talk-llama/llama.h
4
- cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
5
- cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
6
- cp -rpv ../llama.cpp/unicode.cpp ./examples/talk-llama/unicode.cpp
 
 
 
1
  #!/bin/bash
2
 
3
+ cp -rpv ../llama.cpp/llama.h ./examples/talk-llama/llama.h
4
+ cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
5
+ cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
6
+ cp -rpv ../llama.cpp/unicode.cpp ./examples/talk-llama/unicode.cpp
7
+ cp -rpv ../llama.cpp/unicode-data.h ./examples/talk-llama/unicode-data.h
8
+ cp -rpv ../llama.cpp/unicode-data.cpp ./examples/talk-llama/unicode-data.cpp
ggml-alloc.c CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
548
  for (int i = 0; i < graph->n_nodes; i++) {
549
  struct ggml_tensor * node = graph->nodes[i];
550
 
551
- if (ggml_is_view(node)) {
 
 
 
 
552
  struct ggml_tensor * view_src = node->view_src;
553
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
554
  }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
565
 
566
  ggml_gallocr_hash_get(galloc, src)->n_children += 1;
567
 
568
- // allocate explicit inputs and leafs
569
- if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
570
  ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
571
  }
572
  }
 
548
  for (int i = 0; i < graph->n_nodes; i++) {
549
  struct ggml_tensor * node = graph->nodes[i];
550
 
551
+ // TODO: better way to add external dependencies
552
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
553
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
554
+ // itself is never used and should not be considered a dependency
555
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
556
  struct ggml_tensor * view_src = node->view_src;
557
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
558
  }
 
569
 
570
  ggml_gallocr_hash_get(galloc, src)->n_children += 1;
571
 
572
+ // allocate explicit inputs
573
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
574
  ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
575
  }
576
  }
ggml-backend-impl.h CHANGED
@@ -103,6 +103,11 @@ extern "C" {
103
  // check if the backend supports an operation
104
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
 
 
 
 
 
 
106
  // (optional) event synchronization
107
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
108
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
 
103
  // check if the backend supports an operation
104
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
 
106
+ // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107
+ // these should be expensive operations with large batch sizes that may benefit from running on this backend
108
+ // even if the weight has to be copied from the CPU temporarily
109
+ bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
+
111
  // (optional) event synchronization
112
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
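Note: the scheduler consults this hook via the new ggml_backend_offload_op() wrapper (added in ggml-backend.c below) when a node's weights sit in a CPU buffer. A hedged sketch of what a backend-side implementation could look like under the "expensive op, large batch" heuristic described in the comment; the threshold and the op filter are assumptions, not the actual CUDA backend logic.

```
#include "ggml.h"
#include "ggml-backend-impl.h"

// sketch of a backend's offload_op callback (hypothetical my_backend)
static bool my_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    (void) backend;
    const int64_t min_batch_size = 32; // assumed threshold
    // only offload expensive matrix multiplications with a large batch dimension,
    // so the cost of temporarily copying the CPU-resident weights is amortized
    return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
}

// wired into the backend's ggml_backend_i next to supports_op:
//     /* .offload_op = */ my_backend_offload_op,
```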
ggml-backend.c CHANGED
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
278
  return err;
279
  }
280
 
281
- bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
282
  return backend->iface.graph_compute(backend, cgraph);
283
  }
284
 
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
286
  return backend->iface.supports_op(backend, op);
287
  }
288
 
 
 
 
 
 
 
 
289
  // backend copy
290
 
291
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -413,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
413
  ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
414
 
415
  // add forward decls here to avoid including the backend headers
416
- #ifdef GGML_USE_CUBLAS
417
  extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
418
  ggml_backend_cuda_reg_devices();
419
  #endif
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
761
 
762
  if (cpu_plan->cplan.work_size > 0) {
763
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
 
 
 
 
764
  }
765
 
766
  cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
834
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
835
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
836
  /* .supports_op = */ ggml_backend_cpu_supports_op,
 
837
  /* .event_new = */ NULL,
838
  /* .event_free = */ NULL,
839
  /* .event_record = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
999
  #endif
1000
 
1001
  #ifndef GGML_SCHED_MAX_SPLITS
1002
- #define GGML_SCHED_MAX_SPLITS 256
1003
  #endif
1004
 
1005
  #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1006
- #define GGML_SCHED_MAX_SPLIT_INPUTS 16
1007
  #endif
1008
 
1009
  #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
1043
  struct ggml_cgraph * graph;
1044
 
1045
  // graph splits
1046
- struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
1047
  int n_splits;
 
1048
 
1049
  // pipeline parallelism support
1050
  int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1114
  // TODO: use supports_op to check if the backend supports the op
1115
 
1116
  // assign pre-allocated nodes to their backend
1117
- // dst
1118
- int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
1119
- if (cur_backend != -1) {
1120
  SET_CAUSE(tensor, "1.dst");
1121
- return cur_backend;
1122
  }
1123
 
1124
  // view_src
1125
  if (tensor->view_src != NULL) {
1126
- cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
1127
- if (cur_backend != -1) {
1128
  SET_CAUSE(tensor, "1.vsrc");
1129
- return cur_backend;
1130
  }
1131
  }
1132
 
1133
- // input
1134
  if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1135
- cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
1136
  SET_CAUSE(tensor, "1.inp");
1137
- return cur_backend;
1138
  }
1139
 
1140
  // assign nodes that use weights to the backend of the weights
 
1141
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1142
  const struct ggml_tensor * src = tensor->src[i];
1143
  if (src == NULL) {
1144
  continue;
1145
  }
1146
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1147
- int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
1148
- // operations with weights are always run on the same backend as the weights
 
 
 
 
 
 
 
 
1149
  SET_CAUSE(tensor, "1.wgt%d", i);
1150
- return src_backend;
1151
  }
1152
  }
1153
 
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1227
  // pass 1: assign backends to ops with pre-allocated inputs
1228
  for (int i = 0; i < graph->n_leafs; i++) {
1229
  struct ggml_tensor * leaf = graph->leafs[i];
1230
- if (tensor_backend_id(leaf) != -1) {
 
1231
  // do not overwrite user assignments
1232
  continue;
1233
  }
1234
- tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1235
  }
1236
 
1237
  for (int i = 0; i < graph->n_nodes; i++) {
1238
  struct ggml_tensor * node = graph->nodes[i];
1239
- if (tensor_backend_id(node) != -1) {
 
1240
  // do not overwrite user assignments
1241
  continue;
1242
  }
1243
- tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
1244
  // src
1245
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1246
  struct ggml_tensor * src = node->src[j];
1247
  if (src == NULL) {
1248
  continue;
1249
  }
1250
- if (tensor_backend_id(src) == -1) {
1251
- tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
 
1252
  }
1253
  }
1254
  }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1270
  if (ggml_is_view_op(node->op)) {
1271
  continue;
1272
  }
1273
- int tensor_backend_id = tensor_backend_id(node);
1274
- if (tensor_backend_id != -1) {
1275
- if (tensor_backend_id == sched->n_backends - 1) {
1276
  // skip cpu (lowest prio backend)
1277
  cur_backend_id = -1;
1278
  } else {
1279
- cur_backend_id = tensor_backend_id;
1280
  }
1281
  } else {
1282
- tensor_backend_id(node) = cur_backend_id;
1283
  SET_CAUSE(node, "2.2");
1284
  }
1285
  }
1286
  }
1287
-
1288
  // pass 2.1 expand gpu up
1289
  {
1290
  int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1293
  if (ggml_is_view_op(node->op)) {
1294
  continue;
1295
  }
1296
- int tensor_backend_id = tensor_backend_id(node);
1297
- if (tensor_backend_id != -1) {
1298
- if (tensor_backend_id == sched->n_backends - 1) {
1299
  // skip cpu (lowest prio backend)
1300
  cur_backend_id = -1;
1301
  } else {
1302
- cur_backend_id = tensor_backend_id;
1303
  }
1304
  } else {
1305
- tensor_backend_id(node) = cur_backend_id;
1306
  SET_CAUSE(node, "2.1");
1307
  }
1308
  }
1309
  }
1310
-
1311
-
1312
  // pass 2.4 expand rest down
1313
  {
1314
  int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1317
  if (ggml_is_view_op(node->op)) {
1318
  continue;
1319
  }
1320
- int tensor_backend_id = tensor_backend_id(node);
1321
- if (tensor_backend_id != -1) {
1322
- cur_backend_id = tensor_backend_id;
1323
  } else {
1324
- tensor_backend_id(node) = cur_backend_id;
1325
  SET_CAUSE(node, "2.4");
1326
  }
1327
  }
1328
  }
1329
- // pass 2.3 expand rest up
1330
  {
1331
  int cur_backend_id = -1;
1332
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1334
  if (ggml_is_view_op(node->op)) {
1335
  continue;
1336
  }
1337
- int tensor_backend_id = tensor_backend_id(node);
1338
- if (tensor_backend_id != -1) {
1339
- cur_backend_id = tensor_backend_id;
1340
  } else {
1341
- tensor_backend_id(node) = cur_backend_id;
1342
  SET_CAUSE(node, "2.3");
1343
  }
1344
  }
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1351
  // pass 3: assign backends to remaining src from dst and view_src
1352
  for (int i = 0; i < graph->n_nodes; i++) {
1353
  struct ggml_tensor * node = graph->nodes[i];
1354
- int cur_backend_id = tensor_backend_id(node);
1355
- if (node->view_src != NULL && cur_backend_id == -1) {
1356
- cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
1357
  SET_CAUSE(node, "3.vsrc");
1358
  }
1359
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1361
  if (src == NULL) {
1362
  continue;
1363
  }
1364
- int src_backend_id = tensor_backend_id(src);
1365
- if (src_backend_id == -1) {
1366
  if (src->view_src != NULL) {
1367
  // views are always on the same backend as the source
1368
- tensor_backend_id(src) = tensor_backend_id(src->view_src);
1369
  SET_CAUSE(src, "3.vsrc");
1370
  } else {
1371
- tensor_backend_id(src) = cur_backend_id;
1372
  SET_CAUSE(src, "3.cur");
1373
  }
1374
  }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1380
 
1381
  // pass 4: split graph, find tensors that need to be copied
1382
  {
1383
- int cur_split = 0;
 
1384
  // find the backend of the first split, skipping view ops
1385
  for (int i = 0; i < graph->n_nodes; i++) {
1386
  struct ggml_tensor * node = graph->nodes[i];
1387
  if (!ggml_is_view_op(node->op)) {
1388
- sched->splits[0].backend_id = tensor_backend_id(node);
1389
  break;
1390
  }
1391
  }
1392
- sched->splits[0].i_start = 0;
1393
- sched->splits[0].n_inputs = 0;
1394
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1395
- int cur_backend_id = sched->splits[0].backend_id;
1396
  for (int i = 0; i < graph->n_nodes; i++) {
1397
  struct ggml_tensor * node = graph->nodes[i];
1398
 
@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1400
  continue;
1401
  }
1402
 
1403
- int tensor_backend_id = tensor_backend_id(node);
1404
 
1405
- GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
1406
 
1407
- if (tensor_backend_id != cur_backend_id) {
1408
- sched->splits[cur_split].i_end = i;
1409
- cur_split++;
1410
- GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
1411
- sched->splits[cur_split].backend_id = tensor_backend_id;
1412
- sched->splits[cur_split].i_start = i;
1413
- sched->splits[cur_split].n_inputs = 0;
1414
- cur_backend_id = tensor_backend_id;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1415
  }
1416
 
1417
  // find inputs that are not on the same backend
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1421
  continue;
1422
  }
1423
 
1424
- int src_backend_id = tensor_backend_id(src);
1425
  assert(src_backend_id != -1); // all inputs should be assigned by now
1426
 
1427
- if (src->flags & GGML_TENSOR_FLAG_INPUT) {
1428
  size_t id = hash_id(src);
1429
  if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1430
  ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1441
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1442
  }
1443
  sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1444
- tensor_backend_id(tensor_copy) = src_backend_id;
1445
  SET_CAUSE(tensor_copy, "4.cpy");
1446
  }
1447
  int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1450
  }
1451
  }
1452
 
1453
- if (src_backend_id != tensor_backend_id) {
1454
  // create a copy of the input in the split's backend
1455
- size_t id = hash_id(src);
1456
  if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1457
  ggml_backend_t backend = sched->backends[cur_backend_id];
1458
  for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1463
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1464
  }
1465
  sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1466
- tensor_backend_id(tensor_copy) = cur_backend_id;
1467
  SET_CAUSE(tensor_copy, "4.cpy");
1468
  }
1469
- int n_inputs = sched->splits[cur_split].n_inputs++;
1470
  GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1471
- sched->splits[cur_split].inputs[n_inputs] = src;
1472
  }
1473
  node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1474
  }
1475
  }
1476
  }
1477
- sched->splits[cur_split].i_end = graph->n_nodes;
1478
- sched->n_splits = cur_split + 1;
1479
  }
1480
  #ifdef DEBUG_PASS4
1481
  fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1482
  #endif
1483
 
1484
- #ifndef NDEBUG
1485
- // sanity check: all sources should have the same backend as the node
1486
- for (int i = 0; i < graph->n_nodes; i++) {
1487
- struct ggml_tensor * node = graph->nodes[i];
1488
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1489
- if (tensor_backend == NULL) {
1490
- fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1491
- }
1492
- if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
1493
- fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1494
- node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1495
- node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
1496
- ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
1497
- }
1498
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1499
- struct ggml_tensor * src = node->src[j];
1500
- if (src == NULL) {
1501
- continue;
1502
- }
1503
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1504
- if (src_backend != tensor_backend /* && src_backend != NULL */) {
1505
- fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1506
- node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1507
- j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
1508
- }
1509
- if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
1510
- fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1511
- src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
1512
- src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
1513
- ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
1514
- }
1515
- }
1516
- }
1517
- fflush(stderr);
1518
- #endif
1519
-
1520
  // create copies of the graph for each split
1521
  // TODO: avoid this copy
1522
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
1523
  for (int i = 0; i < sched->n_splits; i++) {
1524
  struct ggml_backend_sched_split * split = &sched->splits[i];
1525
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1526
 
1527
  // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1528
  for (int j = 0; j < split->n_inputs; j++) {
 
 
1529
  struct ggml_tensor * input = split->inputs[j];
1530
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
 
1531
 
1532
  // add a dependency to the input source so that it is not freed before the copy is done
1533
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1534
  input_dep->src[0] = input;
1535
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
1536
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1537
 
1538
  // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1541
  }
1542
 
1543
  for (int j = split->i_start; j < split->i_end; j++) {
 
1544
  sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1545
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1546
  }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1625
  }
1626
  ggml_backend_tensor_copy(input, input_cpy);
1627
  } else {
 
1628
  if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1629
  ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1630
  } else {
1631
  ggml_backend_synchronize(split_backend);
1632
- ggml_backend_synchronize(input_backend);
1633
  }
1634
-
1635
  ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1636
  }
1637
  }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
1701
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1702
 
1703
  // initialize hash table
1704
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
1705
  sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1706
  sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1707
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
1708
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
 
 
1709
 
1710
  sched->n_backends = n_backends;
1711
 
1712
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1713
 
1714
- GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
 
 
1715
 
1716
  for (int b = 0; b < n_backends; b++) {
1717
  sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1742
  }
1743
  ggml_gallocr_free(sched->galloc);
1744
  ggml_free(sched->ctx);
 
1745
  free(sched->hash_set.keys);
1746
  free(sched->tensor_backend_id);
1747
  free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1762
  }
1763
 
1764
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
 
 
1765
  ggml_backend_sched_split_graph(sched, measure_graph);
1766
 
1767
  // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1776
  }
1777
 
1778
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1779
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
1780
 
1781
  ggml_backend_sched_split_graph(sched, graph);
1782
 
 
278
  return err;
279
  }
280
 
281
+ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
282
  return backend->iface.graph_compute(backend, cgraph);
283
  }
284
 
 
286
  return backend->iface.supports_op(backend, op);
287
  }
288
 
289
+ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
290
+ if (backend->iface.offload_op != NULL) {
291
+ return backend->iface.offload_op(backend, op);
292
+ }
293
+ return false;
294
+ }
295
+
296
  // backend copy
297
 
298
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
 
420
  ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
421
 
422
  // add forward decls here to avoid including the backend headers
423
+ #ifdef GGML_USE_CUDA
424
  extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
425
  ggml_backend_cuda_reg_devices();
426
  #endif
 
768
 
769
  if (cpu_plan->cplan.work_size > 0) {
770
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
771
+ if (cpu_plan->cplan.work_data == NULL) {
772
+ free(cpu_plan);
773
+ return NULL;
774
+ }
775
  }
776
 
777
  cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
 
845
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
846
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
847
  /* .supports_op = */ ggml_backend_cpu_supports_op,
848
+ /* .offload_op = */ NULL,
849
  /* .event_new = */ NULL,
850
  /* .event_free = */ NULL,
851
  /* .event_record = */ NULL,
 
1011
  #endif
1012
 
1013
  #ifndef GGML_SCHED_MAX_SPLITS
1014
+ #define GGML_SCHED_MAX_SPLITS 2048
1015
  #endif
1016
 
1017
  #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1018
+ #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1019
  #endif
1020
 
1021
  #ifndef GGML_SCHED_MAX_COPIES
 
1055
  struct ggml_cgraph * graph;
1056
 
1057
  // graph splits
1058
+ struct ggml_backend_sched_split * splits;
1059
  int n_splits;
1060
+ int splits_capacity;
1061
 
1062
  // pipeline parallelism support
1063
  int n_copies;
 
1127
  // TODO: use supports_op to check if the backend supports the op
1128
 
1129
  // assign pre-allocated nodes to their backend
1130
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
1131
+ if (cur_backend_id != -1) {
 
1132
  SET_CAUSE(tensor, "1.dst");
1133
+ return cur_backend_id;
1134
  }
1135
 
1136
  // view_src
1137
  if (tensor->view_src != NULL) {
1138
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
1139
+ if (cur_backend_id != -1) {
1140
  SET_CAUSE(tensor, "1.vsrc");
1141
+ return cur_backend_id;
1142
  }
1143
  }
1144
 
1145
+ // graph input
1146
  if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1147
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1148
  SET_CAUSE(tensor, "1.inp");
1149
+ return cur_backend_id;
1150
  }
1151
 
1152
  // assign nodes that use weights to the backend of the weights
1153
+ // operations with weights are preferably run on the same backend as the weights
1154
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1155
  const struct ggml_tensor * src = tensor->src[i];
1156
  if (src == NULL) {
1157
  continue;
1158
  }
1159
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1160
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
1161
+ // check if a backend with higher prio wants to offload the op
1162
+ if (src_backend_id == sched->n_backends - 1) {
1163
+ for (int b = 0; b < src_backend_id; b++) {
1164
+ if (ggml_backend_offload_op(sched->backends[b], tensor)) {
1165
+ SET_CAUSE(tensor, "1.off");
1166
+ return b;
1167
+ }
1168
+ }
1169
+ }
1170
  SET_CAUSE(tensor, "1.wgt%d", i);
1171
+ return src_backend_id;
1172
  }
1173
  }
1174
 
 
1248
  // pass 1: assign backends to ops with pre-allocated inputs
1249
  for (int i = 0; i < graph->n_leafs; i++) {
1250
  struct ggml_tensor * leaf = graph->leafs[i];
1251
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1252
+ if (*leaf_backend_id != -1) {
1253
  // do not overwrite user assignments
1254
  continue;
1255
  }
1256
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1257
  }
1258
 
1259
  for (int i = 0; i < graph->n_nodes; i++) {
1260
  struct ggml_tensor * node = graph->nodes[i];
1261
+ int * node_backend_id = &tensor_backend_id(node);
1262
+ if (*node_backend_id != -1) {
1263
  // do not overwrite user assignments
1264
  continue;
1265
  }
1266
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1267
  // src
1268
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1269
  struct ggml_tensor * src = node->src[j];
1270
  if (src == NULL) {
1271
  continue;
1272
  }
1273
+ int * src_backend_id = &tensor_backend_id(src);
1274
+ if (*src_backend_id == -1) {
1275
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1276
  }
1277
  }
1278
  }
 
1294
  if (ggml_is_view_op(node->op)) {
1295
  continue;
1296
  }
1297
+ int * node_backend_id = &tensor_backend_id(node);
1298
+ if (*node_backend_id != -1) {
1299
+ if (*node_backend_id == sched->n_backends - 1) {
1300
  // skip cpu (lowest prio backend)
1301
  cur_backend_id = -1;
1302
  } else {
1303
+ cur_backend_id = *node_backend_id;
1304
  }
1305
  } else {
1306
+ *node_backend_id = cur_backend_id;
1307
  SET_CAUSE(node, "2.2");
1308
  }
1309
  }
1310
  }
 
1311
  // pass 2.1 expand gpu up
1312
  {
1313
  int cur_backend_id = -1;
 
1316
  if (ggml_is_view_op(node->op)) {
1317
  continue;
1318
  }
1319
+ int * node_backend_id = &tensor_backend_id(node);
1320
+ if (*node_backend_id != -1) {
1321
+ if (*node_backend_id == sched->n_backends - 1) {
1322
  // skip cpu (lowest prio backend)
1323
  cur_backend_id = -1;
1324
  } else {
1325
+ cur_backend_id = *node_backend_id;
1326
  }
1327
  } else {
1328
+ *node_backend_id = cur_backend_id;
1329
  SET_CAUSE(node, "2.1");
1330
  }
1331
  }
1332
  }
 
 
1333
  // pass 2.4 expand rest down
1334
  {
1335
  int cur_backend_id = -1;
 
1338
  if (ggml_is_view_op(node->op)) {
1339
  continue;
1340
  }
1341
+ int * node_backend_id = &tensor_backend_id(node);
1342
+ if (*node_backend_id != -1) {
1343
+ cur_backend_id = *node_backend_id;
1344
  } else {
1345
+ *node_backend_id = cur_backend_id;
1346
  SET_CAUSE(node, "2.4");
1347
  }
1348
  }
1349
  }
1350
+ // pass 2.3 expand rest up
1351
  {
1352
  int cur_backend_id = -1;
1353
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
 
1355
  if (ggml_is_view_op(node->op)) {
1356
  continue;
1357
  }
1358
+ int * node_backend_id = &tensor_backend_id(node);
1359
+ if (*node_backend_id != -1) {
1360
+ cur_backend_id = *node_backend_id;
1361
  } else {
1362
+ *node_backend_id = cur_backend_id;
1363
  SET_CAUSE(node, "2.3");
1364
  }
1365
  }
 
1372
  // pass 3: assign backends to remaining src from dst and view_src
1373
  for (int i = 0; i < graph->n_nodes; i++) {
1374
  struct ggml_tensor * node = graph->nodes[i];
1375
+ int * cur_backend_id = &tensor_backend_id(node);
1376
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1377
+ *cur_backend_id = tensor_backend_id(node->view_src);
1378
  SET_CAUSE(node, "3.vsrc");
1379
  }
1380
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
1382
  if (src == NULL) {
1383
  continue;
1384
  }
1385
+ int * src_backend_id = &tensor_backend_id(src);
1386
+ if (*src_backend_id == -1) {
1387
  if (src->view_src != NULL) {
1388
  // views are always on the same backend as the source
1389
+ *src_backend_id = tensor_backend_id(src->view_src);
1390
  SET_CAUSE(src, "3.vsrc");
1391
  } else {
1392
+ *src_backend_id = *cur_backend_id;
1393
  SET_CAUSE(src, "3.cur");
1394
  }
1395
  }
 
1401
 
1402
  // pass 4: split graph, find tensors that need to be copied
1403
  {
1404
+ int i_split = 0;
1405
+ struct ggml_backend_sched_split * split = &sched->splits[0];
1406
  // find the backend of the first split, skipping view ops
1407
  for (int i = 0; i < graph->n_nodes; i++) {
1408
  struct ggml_tensor * node = graph->nodes[i];
1409
  if (!ggml_is_view_op(node->op)) {
1410
+ split->backend_id = tensor_backend_id(node);
1411
  break;
1412
  }
1413
  }
1414
+ split->i_start = 0;
1415
+ split->n_inputs = 0;
1416
+ memset(split->inputs, 0, sizeof(split->inputs)); //HACK
1417
+ int cur_backend_id = split->backend_id;
1418
  for (int i = 0; i < graph->n_nodes; i++) {
1419
  struct ggml_tensor * node = graph->nodes[i];
1420
 
 
1422
  continue;
1423
  }
1424
 
1425
+ const int node_backend_id = tensor_backend_id(node);
1426
 
1427
+ GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
1428
 
1429
+ // check if we should start a new split based on the sources of the current node
1430
+ bool need_new_split = false;
1431
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1432
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1433
+ struct ggml_tensor * src = node->src[j];
1434
+ if (src == NULL) {
1435
+ continue;
1436
+ }
1437
+ // check if a weight is on a different backend
1438
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1439
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1440
+ int src_backend_id = tensor_backend_id(src);
1441
+ if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
1442
+ need_new_split = true;
1443
+ break;
1444
+ }
1445
+ }
1446
+ // check if the split has too many inputs
1447
+ if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1448
+ const size_t id = hash_id(src);
1449
+ int src_backend_id = sched->tensor_backend_id[id];
1450
+ if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
1451
+ //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1452
+ need_new_split = true;
1453
+ break;
1454
+ }
1455
+ }
1456
+ }
1457
+ }
1458
+
1459
+ if (node_backend_id != cur_backend_id || need_new_split) {
1460
+ split->i_end = i;
1461
+ i_split++;
1462
+ if (i_split >= sched->splits_capacity) {
1463
+ sched->splits_capacity *= 2;
1464
+ sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1465
+ GGML_ASSERT(sched->splits != NULL);
1466
+ }
1467
+ GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
1468
+ split = &sched->splits[i_split];
1469
+ split->backend_id = node_backend_id;
1470
+ split->i_start = i;
1471
+ split->n_inputs = 0;
1472
+ cur_backend_id = node_backend_id;
1473
  }
1474
 
1475
  // find inputs that are not on the same backend
 
1479
  continue;
1480
  }
1481
 
1482
+ const int src_backend_id = tensor_backend_id(src);
1483
  assert(src_backend_id != -1); // all inputs should be assigned by now
1484
 
1485
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1486
  size_t id = hash_id(src);
1487
  if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1488
  ggml_backend_t backend = sched->backends[src_backend_id];
 
1499
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1500
  }
1501
  sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
 
1502
  SET_CAUSE(tensor_copy, "4.cpy");
1503
  }
1504
  int n_graph_inputs = sched->n_graph_inputs++;
 
1507
  }
1508
  }
1509
 
1510
+ if (src_backend_id != node_backend_id) {
1511
  // create a copy of the input in the split's backend
1512
+ const size_t id = hash_id(src);
1513
  if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1514
  ggml_backend_t backend = sched->backends[cur_backend_id];
1515
  for (int c = 0; c < sched->n_copies; c++) {
 
1520
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1521
  }
1522
  sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
 
1523
  SET_CAUSE(tensor_copy, "4.cpy");
1524
  }
1525
+ int n_inputs = split->n_inputs++;
1526
  GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1527
+ split->inputs[n_inputs] = src;
1528
  }
1529
  node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1530
  }
1531
  }
1532
  }
1533
+ split->i_end = graph->n_nodes;
1534
+ sched->n_splits = i_split + 1;
1535
  }
1536
  #ifdef DEBUG_PASS4
1537
  fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1538
  #endif
1539
 
1540
  // create copies of the graph for each split
1541
  // TODO: avoid this copy
1542
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
1543
  for (int i = 0; i < sched->n_splits; i++) {
1544
  struct ggml_backend_sched_split * split = &sched->splits[i];
1545
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1546
 
1547
  // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1548
  for (int j = 0; j < split->n_inputs; j++) {
1549
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1550
+
1551
  struct ggml_tensor * input = split->inputs[j];
1552
+ const size_t input_id = hash_id(input);
1553
+ struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
1554
 
1555
  // add a dependency to the input source so that it is not freed before the copy is done
1556
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1557
  input_dep->src[0] = input;
1558
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
1559
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1560
 
1561
  // add a dependency to the input copy so that it is allocated at the start of the split
 
1564
  }
1565
 
1566
  for (int j = split->i_start; j < split->i_end; j++) {
1567
+ assert(graph_copy->size > graph_copy->n_nodes);
1568
  sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1569
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1570
  }
 
1649
  }
1650
  ggml_backend_tensor_copy(input, input_cpy);
1651
  } else {
1652
+ // wait for the split backend to finish using the input before overwriting it
1653
  if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1654
  ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1655
  } else {
1656
  ggml_backend_synchronize(split_backend);
 
1657
  }
 
1658
  ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1659
  }
1660
  }
 
1724
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1725
 
1726
  // initialize hash table
1727
+ sched->hash_set = ggml_hash_set_new(graph_size);
1728
  sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1729
  sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1730
+
1731
+ const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1732
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
1733
+ sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
1734
 
1735
  sched->n_backends = n_backends;
1736
 
1737
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1738
 
1739
+ const int initial_splits_capacity = 16;
1740
+ sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
1741
+ sched->splits_capacity = initial_splits_capacity;
1742
 
1743
  for (int b = 0; b < n_backends; b++) {
1744
  sched->backends[b] = backends[b];
 
1769
  }
1770
  ggml_gallocr_free(sched->galloc);
1771
  ggml_free(sched->ctx);
1772
+ free(sched->splits);
1773
  free(sched->hash_set.keys);
1774
  free(sched->tensor_backend_id);
1775
  free(sched->tensor_copies);
 
1790
  }
1791
 
1792
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1793
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
1794
+
1795
  ggml_backend_sched_split_graph(sched, measure_graph);
1796
 
1797
  // TODO: extract this to a separate function
 
1806
  }
1807
 
1808
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1809
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
1810
 
1811
  ggml_backend_sched_split_graph(sched, graph);
1812
 
ggml-backend.h CHANGED
@@ -70,11 +70,11 @@ extern "C" {
70
  GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
71
  GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
72
 
73
- GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
74
- GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
-
76
- GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
77
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
78
 
79
  // tensor copy between different backends
80
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
 
70
  GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
71
  GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
72
 
73
+ GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
74
+ GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
+ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
76
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
77
+ GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
78
 
79
  // tensor copy between different backends
80
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
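Editor's note: with the async variant now returning an enum ggml_status like its synchronous counterpart, a call site can check the result and then synchronize before reading outputs. This is only a hedged call-site fragment: `backend` and `cgraph` are assumed to exist already, GGML_STATUS_SUCCESS is assumed to be the success value of ggml_status, and only functions visible in this header are used.

    enum ggml_status status = ggml_backend_graph_compute_async(backend, cgraph);
    if (status != GGML_STATUS_SUCCESS) {     // assumed success value of enum ggml_status
        fprintf(stderr, "graph compute failed: %d\n", (int) status);
    }
    ggml_backend_synchronize(backend);       // wait for completion before reading results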
ggml-common.h CHANGED
@@ -377,6 +377,27 @@ typedef struct {
377
  } block_iq1_s;
378
  static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
379
 
380
  // Non-linear quants
381
  #define QK4_NL 32
382
  typedef struct {
@@ -1050,6 +1071,7 @@ GGML_TABLE_END()
1050
 
1051
  #define NGRID_IQ1S 2048
1052
  #define IQ1S_DELTA 0.125f
 
1053
  #if defined(GGML_COMMON_IMPL_C)
1054
  GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
1055
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
 
377
  } block_iq1_s;
378
  static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
379
 
380
+ // 1.75 bpw
381
+ typedef struct {
382
+ uint8_t qs[QK_K/8]; // grid index, low 8 bits
383
+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384
+ #if QK_K == 64
385
+ ggml_half d;
386
+ #endif
387
+ uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
388
+ } block_iq1_m;
389
+ #if QK_K == 64
390
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
391
+ #else
392
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393
+ #endif
394
+
395
+ // Used by IQ1_M quants
396
+ typedef union {
397
+ ggml_half f16;
398
+ uint16_t u16;
399
+ } iq1m_scale_t;
400
+
401
  // Non-linear quants
402
  #define QK4_NL 32
403
  typedef struct {
 
1071
 
1072
  #define NGRID_IQ1S 2048
1073
  #define IQ1S_DELTA 0.125f
1074
+ #define IQ1M_DELTA 0.125f
1075
  #if defined(GGML_COMMON_IMPL_C)
1076
  GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
1077
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
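Editor's note: the "1.75 bpw" figure in the block_iq1_m comment follows directly from the layout when QK_K == 256 (the usual configuration): QK_K/8 + QK_K/16 + QK_K/32 = 32 + 16 + 8 = 56 bytes per 256 weights, i.e. 56*8/256 = 1.75 bits per weight. The ggml_half d field only exists in the QK_K == 64 variant; for QK_K == 256 the per-block scale is carried inside the existing bytes, which appears to be what the iq1m_scale_t helper union is for. A small stand-alone sanity check of the arithmetic (the struct here is a sketch, not the real declaration):

    #define QK_K 256                      // sketch only; the real value comes from ggml-common.h

    typedef struct {
        unsigned char qs[QK_K/8];         // 32 bytes: low 8 bits of the grid indices
        unsigned char qh[QK_K/16];        // 16 bytes: high bits + shift bits
        unsigned char scales[QK_K/32];    //  8 bytes: 3-bit block scales
    } block_iq1_m_sketch;

    // 56 bytes * 8 bits / 256 weights = 1.75 bits per weight
    _Static_assert(sizeof(block_iq1_m_sketch) == 56, "unexpected iq1_m sketch size");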
ggml-cuda.cu CHANGED
The diff for this file is too large to render. See raw diff
 
ggml-cuda.h CHANGED
@@ -17,29 +17,17 @@ extern "C" {
17
 
18
  #define GGML_CUDA_MAX_DEVICES 16
19
 
20
- // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
21
- GGML_API GGML_CALL void ggml_init_cublas(void);
22
-
23
- // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
24
- GGML_API GGML_CALL bool ggml_cublas_loaded(void);
25
-
26
- GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
27
- GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
28
-
29
- GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
30
- GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
31
-
32
- GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
33
- GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
34
-
35
  // backend API
36
  GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
37
 
38
  GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
39
 
 
40
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
41
  // split tensor buffer that splits matrices by rows across multiple devices
42
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
 
43
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
44
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
45
 
@@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
47
  GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
48
  GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
49
 
50
  #ifdef __cplusplus
51
  }
52
  #endif
 
17
 
18
  #define GGML_CUDA_MAX_DEVICES 16
19
 
20
  // backend API
21
  GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
22
 
23
  GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
24
 
25
+ // device buffer
26
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
27
+
28
  // split tensor buffer that splits matrices by rows across multiple devices
29
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
30
+
31
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
32
  GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
33
 
 
35
  GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
36
  GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
37
 
38
+ GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
39
+ GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
40
+
41
  #ifdef __cplusplus
42
  }
43
  #endif
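Editor's note: the two new host-buffer calls let an application pin memory it already owns so that tensor copies between the CPU and CUDA backends can take the faster pinned path. A hedged usage sketch follows; `buf_size` and the surrounding allocation are hypothetical, and only the functions declared above are used.

    void * buf = malloc(buf_size);                       // hypothetical host allocation
    if (!ggml_backend_cuda_register_host_buffer(buf, buf_size)) {
        // registration is best-effort: the buffer stays usable, copies are just not pinned
    }

    // ... use buf as backing storage for CPU-side tensors ...

    ggml_backend_cuda_unregister_host_buffer(buf);
    free(buf);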
ggml-cuda/acc.cu ADDED
@@ -0,0 +1,47 @@
1
+ #include "acc.cuh"
2
+
3
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
4
+ const int ne10, const int ne11, const int ne12,
5
+ const int nb1, const int nb2, int offset) {
6
+ const int i = blockDim.x * blockIdx.x + threadIdx.x;
7
+ if (i >= ne) {
8
+ return;
9
+ }
10
+ int src1_idx = i - offset;
11
+ int oz = src1_idx / nb2;
12
+ int oy = (src1_idx - (oz * nb2)) / nb1;
13
+ int ox = src1_idx % nb1;
14
+ if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
15
+ dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
16
+ } else {
17
+ dst[i] = x[i];
18
+ }
19
+ }
20
+
21
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
22
+ const int ne10, const int ne11, const int ne12,
23
+ const int nb1, const int nb2, const int offset, cudaStream_t stream) {
24
+ int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
25
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
26
+ }
27
+
28
+ void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
29
+ const ggml_tensor * src0 = dst->src[0];
30
+ const ggml_tensor * src1 = dst->src[1];
31
+ const float * src0_d = (const float *)src0->data;
32
+ const float * src1_d = (const float *)src1->data;
33
+ float * dst_d = (float *)dst->data;
34
+ cudaStream_t stream = ctx.stream();
35
+
36
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
37
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
38
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
39
+ GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
40
+
41
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
42
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
43
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
44
+ int offset = dst->op_params[3] / 4; // offset in bytes
45
+
46
+ acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
47
+ }
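Editor's note: the kernel treats dst as a flat copy of src0 with src1 accumulated into a sub-view; nb1, nb2 and offset arrive in bytes and are divided by four to become float strides. Below is a plain-C transcription of the same indexing, useful as a reference when checking the kernel; the function name is hypothetical.

    // CPU reference of acc_f32: strides and offset are already in float units,
    // exactly as passed to acc_f32_cuda above.
    static void acc_f32_ref(const float * x, const float * y, float * dst, int ne,
                            int ne10, int ne11, int ne12, int nb1, int nb2, int offset) {
        for (int i = 0; i < ne; ++i) {
            const int src1_idx = i - offset;
            const int oz = src1_idx / nb2;
            const int oy = (src1_idx - oz*nb2) / nb1;
            const int ox = src1_idx % nb1;
            if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
                dst[i] = x[i] + y[ox + oy*ne10 + oz*ne10*ne11];
            } else {
                dst[i] = x[i];
            }
        }
    }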
ggml-cuda/acc.cuh ADDED
@@ -0,0 +1,5 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_ACC_BLOCK_SIZE 256
4
+
5
+ void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/alibi.cu ADDED
@@ -0,0 +1,63 @@
1
+ #include "alibi.cuh"
2
+
3
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
4
+ const int n_heads_log2_floor, const float m0, const float m1) {
5
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
6
+
7
+ if (col >= ncols) {
8
+ return;
9
+ }
10
+
11
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
12
+ const int i = row*ncols + col;
13
+
14
+ const int k = row/k_rows;
15
+
16
+ float m_k;
17
+ if (k < n_heads_log2_floor) {
18
+ m_k = powf(m0, k + 1);
19
+ } else {
20
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
21
+ }
22
+
23
+ dst[i] = col * m_k + x[i];
24
+ }
25
+
26
+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
27
+ const int k_rows, const int n_heads_log2_floor, const float m0,
28
+ const float m1, cudaStream_t stream) {
29
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
30
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
31
+ const dim3 block_nums(num_blocks_x, nrows, 1);
32
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
33
+ }
34
+
35
+ void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
36
+ const ggml_tensor * src0 = dst->src[0];
37
+ const float * src0_d = (const float *)src0->data;
38
+ float * dst_d = (float *)dst->data;
39
+ cudaStream_t stream = ctx.stream();
40
+
41
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
42
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
43
+
44
+ const int64_t ne00 = src0->ne[0];
45
+ const int64_t ne01 = src0->ne[1];
46
+ const int64_t ne02 = src0->ne[2];
47
+ const int64_t nrows = ggml_nrows(src0);
48
+
49
+ //const int n_past = ((int32_t *) dst->op_params)[0];
50
+ const int n_head = ((int32_t *) dst->op_params)[1];
51
+ float max_bias;
52
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
53
+
54
+ //GGML_ASSERT(ne01 + n_past == ne00);
55
+ GGML_ASSERT(n_head == ne02);
56
+
57
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
58
+
59
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
60
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
61
+
62
+ alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
63
+ }
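Editor's note: the per-head ALiBi slopes computed on the host above follow the usual scheme, with m0 = 2^(-max_bias / n_heads_log2_floor) for the first n_heads_log2_floor heads and interleaved powers of m1 = 2^(-(max_bias/2) / n_heads_log2_floor) for the rest. The helper below just collects the kernel's m_k selection into one host function for clarity; it is a sketch, not part of the diff.

    #include <math.h>

    // per-head ALiBi slope, matching the m_k selection inside alibi_f32
    static void alibi_slopes(float * slopes, int n_head, float max_bias) {
        const int   n_heads_log2_floor = 1 << (int) floor(log2((double) n_head));
        const float m0 = powf(2.0f, -max_bias / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        for (int k = 0; k < n_head; ++k) {
            slopes[k] = k < n_heads_log2_floor
                ? powf(m0, k + 1)
                : powf(m1, 2*(k - n_heads_log2_floor) + 1);
        }
    }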
ggml-cuda/alibi.cuh ADDED
@@ -0,0 +1,5 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_ALIBI_BLOCK_SIZE 32
4
+
5
+ void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/arange.cu ADDED
@@ -0,0 +1,34 @@
1
+ #include "arange.cuh"
2
+
3
+ static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
4
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
5
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
6
+ if (nidx >= ne0) {
7
+ return;
8
+ }
9
+ dst[nidx] = start + step * nidx;
10
+ }
11
+
12
+ static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
13
+ int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
14
+ arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
15
+ }
16
+
17
+ void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
18
+ float * dst_d = (float *)dst->data;
19
+ cudaStream_t stream = ctx.stream();
20
+
21
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
22
+
23
+ float start;
24
+ float stop;
25
+ float step;
26
+ memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
27
+ memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
28
+ memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
29
+
30
+ int64_t steps = (int64_t)ceil((stop - start) / step);
31
+ GGML_ASSERT(ggml_nelements(dst) == steps);
32
+
33
+ arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
34
+ }
ggml-cuda/arange.cuh ADDED
@@ -0,0 +1,5 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_ARANGE_BLOCK_SIZE 256
4
+
5
+ void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/argsort.cu ADDED
@@ -0,0 +1,77 @@
1
+ #include "argsort.cuh"
2
+
3
+ template<typename T>
4
+ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
5
+ T tmp = a;
6
+ a = b;
7
+ b = tmp;
8
+ }
9
+
10
+ template<ggml_sort_order order>
11
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
12
+ // bitonic sort
13
+ int col = threadIdx.x;
14
+ int row = blockIdx.y;
15
+
16
+ if (col >= ncols) return;
17
+
18
+ const float * x_row = x + row * ncols;
19
+ int * dst_row = dst + row * ncols;
20
+
21
+ // initialize indices
22
+ if (col < ncols) {
23
+ dst_row[col] = col;
24
+ }
25
+ __syncthreads();
26
+
27
+ for (int k = 2; k <= ncols; k *= 2) {
28
+ for (int j = k / 2; j > 0; j /= 2) {
29
+ int ixj = col ^ j;
30
+ if (ixj > col) {
31
+ if ((col & k) == 0) {
32
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
33
+ ggml_cuda_swap(dst_row[col], dst_row[ixj]);
34
+ }
35
+ } else {
36
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
37
+ ggml_cuda_swap(dst_row[col], dst_row[ixj]);
38
+ }
39
+ }
40
+ }
41
+ __syncthreads();
42
+ }
43
+ }
44
+ }
45
+
46
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
47
+ // bitonic sort requires ncols to be power of 2
48
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
49
+
50
+ const dim3 block_dims(ncols, 1, 1);
51
+ const dim3 block_nums(1, nrows, 1);
52
+ if (order == GGML_SORT_ORDER_ASC) {
53
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
54
+ } else if (order == GGML_SORT_ORDER_DESC) {
55
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
56
+ } else {
57
+ GGML_ASSERT(false);
58
+ }
59
+ }
60
+
61
+ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
62
+ const ggml_tensor * src0 = dst->src[0];
63
+ const float * src0_d = (const float *)src0->data;
64
+ float * dst_d = (float *)dst->data;
65
+ cudaStream_t stream = ctx.stream();
66
+
67
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
68
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
69
+ GGML_ASSERT(ggml_is_contiguous(src0));
70
+
71
+ const int64_t ncols = src0->ne[0];
72
+ const int64_t nrows = ggml_nrows(src0);
73
+
74
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
75
+
76
+ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
77
+ }
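Editor's note: because the kernel implements a bitonic sorting network over the column indices, ncols must be a power of two (hence the assertion in argsort_f32_i32_cuda) and one row is sorted per thread block with blockDim.x == ncols. A serial transcription of the same network, handy for validating the device results; the function name is hypothetical.

    // CPU reference of k_argsort_f32_i32 for one row, ascending order.
    // Each (k, j) step performs the same compare-exchanges as one __syncthreads() phase.
    static void bitonic_argsort_asc(const float * x, int * idx, int ncols) {
        for (int c = 0; c < ncols; ++c) {
            idx[c] = c;
        }
        for (int k = 2; k <= ncols; k *= 2) {
            for (int j = k/2; j > 0; j /= 2) {
                for (int col = 0; col < ncols; ++col) {
                    const int ixj = col ^ j;
                    if (ixj > col) {
                        const int ascending    = (col & k) == 0;
                        const int out_of_order = ascending ? x[idx[col]] > x[idx[ixj]]
                                                           : x[idx[col]] < x[idx[ixj]];
                        if (out_of_order) {
                            const int tmp = idx[col]; idx[col] = idx[ixj]; idx[ixj] = tmp;
                        }
                    }
                }
            }
        }
    }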
ggml-cuda/argsort.cuh ADDED
@@ -0,0 +1,3 @@
1
+ #include "common.cuh"
2
+
3
+ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/binbcast.cu ADDED
@@ -0,0 +1,236 @@
1
+ #include "binbcast.cuh"
2
+
3
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
4
+ return b;
5
+ GGML_UNUSED(a);
6
+ }
7
+
8
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
9
+ return a + b;
10
+ }
11
+
12
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
13
+ return a * b;
14
+ }
15
+
16
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
17
+ return a / b;
18
+ }
19
+
20
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
21
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
22
+ int ne0, int ne1, int ne2, int ne3,
23
+ int ne10, int ne11, int ne12, int ne13,
24
+ /*int s0, */ int s1, int s2, int s3,
25
+ /*int s10,*/ int s11, int s12, int s13) {
26
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
27
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
28
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
29
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
30
+
31
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
32
+ return;
33
+ }
34
+
35
+ const int i11 = i1 % ne11;
36
+ const int i12 = i2 % ne12;
37
+ const int i13 = i3 % ne13;
38
+
39
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
40
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
41
+ const size_t i_dst = i_src0;
42
+
43
+ const src0_t * src0_row = src0 + i_src0;
44
+ const src1_t * src1_row = src1 + i_src1;
45
+ dst_t * dst_row = dst + i_dst;
46
+
47
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
48
+ const int i10 = i0 % ne10;
49
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
50
+ }
51
+ }
52
+
53
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
54
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
55
+ int ne0, int ne1, int ne2, int ne3,
56
+ int ne10, int ne11, int ne12, int ne13,
57
+ /*int s0, */ int s1, int s2, int s3,
58
+ /*int s10,*/ int s11, int s12, int s13) {
59
+
60
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
61
+
62
+ const int i3 = i/(ne2*ne1*ne0);
63
+ const int i2 = (i/(ne1*ne0)) % ne2;
64
+ const int i1 = (i/ne0) % ne1;
65
+ const int i0 = i % ne0;
66
+
67
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
68
+ return;
69
+ }
70
+
71
+ const int i11 = i1 % ne11;
72
+ const int i12 = i2 % ne12;
73
+ const int i13 = i3 % ne13;
74
+
75
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
76
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
77
+ const size_t i_dst = i_src0;
78
+
79
+ const src0_t * src0_row = src0 + i_src0;
80
+ const src1_t * src1_row = src1 + i_src1;
81
+ dst_t * dst_row = dst + i_dst;
82
+
83
+ const int i10 = i0 % ne10;
84
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
85
+ }
86
+
87
+ template<float (*bin_op)(const float, const float)>
88
+ struct bin_bcast_cuda {
89
+ template<typename src0_t, typename src1_t, typename dst_t>
90
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
91
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
92
+ cudaStream_t stream) {
93
+
94
+ GGML_TENSOR_BINARY_OP_LOCALS
95
+
96
+ int nr0 = ne10/ne0;
97
+ int nr1 = ne11/ne1;
98
+ int nr2 = ne12/ne2;
99
+ int nr3 = ne13/ne3;
100
+
101
+ int nr[4] = { nr0, nr1, nr2, nr3 };
102
+
103
+ // collapse dimensions until first broadcast dimension
104
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
105
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
106
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
107
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
108
+ auto collapse = [](int64_t cne[]) {
109
+ cne[0] *= cne[1];
110
+ cne[1] = cne[2];
111
+ cne[2] = cne[3];
112
+ cne[3] = 1;
113
+ };
114
+
115
+ auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
116
+ cnb[1] *= cne[1];
117
+ cnb[2] *= cne[2];
118
+ cnb[3] *= cne[3];
119
+ };
120
+
121
+ for (int i = 0; i < 4; i++) {
122
+ if (nr[i] != 1) {
123
+ break;
124
+ }
125
+ if (i > 0) {
126
+ collapse_nb(cnb0, cne0);
127
+ collapse_nb(cnb1, cne1);
128
+ collapse(cne0);
129
+ collapse(cne1);
130
+ }
131
+ }
132
+ {
133
+ int64_t ne0 = cne0[0];
134
+ int64_t ne1 = cne0[1];
135
+ int64_t ne2 = cne0[2];
136
+ int64_t ne3 = cne0[3];
137
+
138
+ int64_t ne10 = cne1[0];
139
+ int64_t ne11 = cne1[1];
140
+ int64_t ne12 = cne1[2];
141
+ int64_t ne13 = cne1[3];
142
+
143
+ size_t nb0 = cnb0[0];
144
+ size_t nb1 = cnb0[1];
145
+ size_t nb2 = cnb0[2];
146
+ size_t nb3 = cnb0[3];
147
+
148
+ size_t nb10 = cnb1[0];
149
+ size_t nb11 = cnb1[1];
150
+ size_t nb12 = cnb1[2];
151
+ size_t nb13 = cnb1[3];
152
+
153
+ size_t s0 = nb0 / sizeof(dst_t);
154
+ size_t s1 = nb1 / sizeof(dst_t);
155
+ size_t s2 = nb2 / sizeof(dst_t);
156
+ size_t s3 = nb3 / sizeof(dst_t);
157
+
158
+ size_t s10 = nb10 / sizeof(src1_t);
159
+ size_t s11 = nb11 / sizeof(src1_t);
160
+ size_t s12 = nb12 / sizeof(src1_t);
161
+ size_t s13 = nb13 / sizeof(src1_t);
162
+
163
+ GGML_ASSERT(s0 == 1);
164
+ GGML_ASSERT(s10 == 1);
165
+
166
+ const int block_size = 128;
167
+
168
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
169
+
170
+ dim3 block_dims;
171
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
172
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
173
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
174
+
175
+ dim3 block_nums(
176
+ (hne0 + block_dims.x - 1) / block_dims.x,
177
+ (ne1 + block_dims.y - 1) / block_dims.y,
178
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
179
+ );
180
+
181
+ if (block_nums.z > 65535) {
182
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
183
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
184
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
185
+ src0_dd, src1_dd, dst_dd,
186
+ ne0, ne1, ne2, ne3,
187
+ ne10, ne11, ne12, ne13,
188
+ /* s0, */ s1, s2, s3,
189
+ /* s10, */ s11, s12, s13);
190
+ } else {
191
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
192
+ src0_dd, src1_dd, dst_dd,
193
+ ne0, ne1, ne2, ne3,
194
+ ne10, ne11, ne12, ne13,
195
+ /* s0, */ s1, s2, s3,
196
+ /* s10, */ s11, s12, s13);
197
+ }
198
+ }
199
+ }
200
+ };
201
+
202
+ template<class op>
203
+ static void ggml_cuda_op_bin_bcast(
204
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
205
+ const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
206
+
207
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
208
+
209
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
210
+ op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
211
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
212
+ op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
213
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
214
+ op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
215
+ } else {
216
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
217
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
218
+ GGML_ASSERT(false);
219
+ }
220
+ }
221
+
222
+ void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
223
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
224
+ }
225
+
226
+ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
227
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
228
+ }
229
+
230
+ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
231
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
232
+ }
233
+
234
+ void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
235
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
236
+ }
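Editor's note: before launching, the operator collapses leading dimensions that are not broadcast so the kernel has to index as few dimensions as possible; when no broadcasting happens at all, the tensors degenerate to a single contiguous dimension. A small stand-alone sketch of that collapsing step (hypothetical function, shapes only, strides omitted):

    #include <stdint.h>

    // Collapse leading non-broadcast dimensions of two 4-D shapes, in the spirit of
    // the collapse/collapse_nb lambdas above.
    static void collapse_non_broadcast(int64_t ne0[4], int64_t ne1[4]) {
        // decide from the ORIGINAL shapes which leading dims are not broadcast,
        // as the real code does with nr[] before collapsing
        int same[4];
        for (int i = 0; i < 4; i++) {
            same[i] = (ne0[i] == ne1[i]);
        }
        for (int i = 0; i < 4; i++) {
            if (!same[i]) {
                break;                   // first broadcast dimension: stop collapsing
            }
            if (i > 0) {
                ne0[0] *= ne0[1]; ne0[1] = ne0[2]; ne0[2] = ne0[3]; ne0[3] = 1;
                ne1[0] *= ne1[1]; ne1[1] = ne1[2]; ne1[2] = ne1[3]; ne1[3] = 1;
            }
        }
    }

    // e.g. ne0 = ne1 = {4096, 32, 8, 1} collapses to {1048576, 1, 1, 1}:
    // the whole add/mul/div then runs over one contiguous dimension.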
ggml-cuda/binbcast.cuh ADDED
@@ -0,0 +1,6 @@
1
+ #include "common.cuh"
2
+
3
+ void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
4
+ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
5
+ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
6
+ void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/clamp.cu ADDED
@@ -0,0 +1,35 @@
1
+ #include "clamp.cuh"
2
+
3
+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
5
+
6
+ if (i >= k) {
7
+ return;
8
+ }
9
+
10
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
11
+ }
12
+
13
+ static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
14
+ const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
15
+ clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
16
+ }
17
+
18
+
19
+ void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
20
+ const ggml_tensor * src0 = dst->src[0];
21
+ const float * src0_d = (const float *)src0->data;
22
+ float * dst_d = (float *)dst->data;
23
+ cudaStream_t stream = ctx.stream();
24
+
25
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
26
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
27
+
28
+ float min;
29
+ float max;
30
+ memcpy(&min, dst->op_params, sizeof(float));
31
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
32
+
33
+ clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
34
+ CUDA_CHECK(cudaGetLastError());
35
+ }
ggml-cuda/clamp.cuh ADDED
@@ -0,0 +1,5 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_CLAMP_BLOCK_SIZE 256
4
+
5
+ void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/common.cuh ADDED
@@ -0,0 +1,557 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-cuda.h"
5
+
6
+ #include <memory>
7
+
8
+ #if defined(GGML_USE_HIPBLAS)
9
+ #define GGML_COMMON_DECL_HIP
10
+ #define GGML_COMMON_IMPL_HIP
11
+ #else
12
+ #define GGML_COMMON_DECL_CUDA
13
+ #define GGML_COMMON_IMPL_CUDA
14
+ #endif
15
+ #include "ggml-common.h"
16
+
17
+ #include <cstdio>
18
+ #include <array>
19
+ #include <cassert>
20
+ #include <cfloat>
21
+ #include <string>
22
+
23
+ #if defined(GGML_USE_HIPBLAS)
24
+ #include <hip/hip_runtime.h>
25
+ #include <hipblas/hipblas.h>
26
+ #include <hip/hip_fp16.h>
27
+ #ifdef __HIP_PLATFORM_AMD__
28
+ // for rocblas_initialize()
29
+ #include "rocblas/rocblas.h"
30
+ #endif // __HIP_PLATFORM_AMD__
31
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
32
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
33
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
34
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
35
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
36
+ #define CUBLAS_OP_N HIPBLAS_OP_N
37
+ #define CUBLAS_OP_T HIPBLAS_OP_T
38
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
39
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
40
+ #define CUDA_R_16F HIPBLAS_R_16F
41
+ #define CUDA_R_32F HIPBLAS_R_32F
42
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
43
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
44
+ #define cublasCreate hipblasCreate
45
+ #define cublasDestroy hipblasDestroy
46
+ #define cublasGemmEx hipblasGemmEx
47
+ #define cublasGemmBatchedEx hipblasGemmBatchedEx
48
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
49
+ #define cublasHandle_t hipblasHandle_t
50
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
51
+ #define cublasSetStream hipblasSetStream
52
+ #define cublasSgemm hipblasSgemm
53
+ #define cublasStatus_t hipblasStatus_t
54
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
55
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
56
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
57
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
58
+ #define cudaDeviceProp hipDeviceProp_t
59
+ #define cudaDeviceSynchronize hipDeviceSynchronize
60
+ #define cudaError_t hipError_t
61
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
62
+ #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
63
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
64
+ #define cudaEventDisableTiming hipEventDisableTiming
65
+ #define cudaEventRecord hipEventRecord
66
+ #define cudaEventSynchronize hipEventSynchronize
67
+ #define cudaEvent_t hipEvent_t
68
+ #define cudaEventDestroy hipEventDestroy
69
+ #define cudaFree hipFree
70
+ #define cudaFreeHost hipHostFree
71
+ #define cudaGetDevice hipGetDevice
72
+ #define cudaGetDeviceCount hipGetDeviceCount
73
+ #define cudaGetDeviceProperties hipGetDeviceProperties
74
+ #define cudaGetErrorString hipGetErrorString
75
+ #define cudaGetLastError hipGetLastError
76
+ #define cudaHostRegister hipHostRegister
77
+ #define cudaHostRegisterPortable hipHostRegisterPortable
78
+ #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
79
+ #define cudaHostUnregister hipHostUnregister
80
+ #define cudaLaunchHostFunc hipLaunchHostFunc
81
+ #ifdef GGML_HIP_UMA
82
+ #define cudaMalloc hipMallocManaged
83
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
84
+ #else
85
+ #define cudaMalloc hipMalloc
86
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
87
+ #endif
88
+ #define cudaMemcpy hipMemcpy
89
+ #define cudaMemcpyAsync hipMemcpyAsync
90
+ #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
91
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
92
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
93
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
94
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
95
+ #define cudaMemcpyKind hipMemcpyKind
96
+ #define cudaMemset hipMemset
97
+ #define cudaMemsetAsync hipMemsetAsync
98
+ #define cudaMemGetInfo hipMemGetInfo
99
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
100
+ #define cudaSetDevice hipSetDevice
101
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
102
+ #define cudaStreamDestroy hipStreamDestroy
103
+ #define cudaStreamFireAndForget hipStreamFireAndForget
104
+ #define cudaStreamNonBlocking hipStreamNonBlocking
105
+ #define cudaStreamPerThread hipStreamPerThread
106
+ #define cudaStreamSynchronize hipStreamSynchronize
107
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
108
+ #define cudaStream_t hipStream_t
109
+ #define cudaSuccess hipSuccess
110
+ #define __trap abort
111
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
112
+ #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
113
+ #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
114
+ #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
115
+ #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
116
+ #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
117
+ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
118
+ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
119
+ #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
120
+ #else
121
+ #include <cuda_runtime.h>
122
+ #include <cuda.h>
123
+ #include <cublas_v2.h>
124
+ #include <cuda_fp16.h>
125
+
126
+ #if CUDART_VERSION < 11020
127
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
128
+ #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
129
+ #define CUBLAS_COMPUTE_16F CUDA_R_16F
130
+ #define CUBLAS_COMPUTE_32F CUDA_R_32F
131
+ #define cublasComputeType_t cudaDataType_t
132
+ #endif // CUDART_VERSION < 11020
133
+
134
+ #endif // defined(GGML_USE_HIPBLAS)
135
+
136
+ #define STRINGIZE_IMPL(...) #__VA_ARGS__
137
+ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
138
+
139
+ #define WARP_SIZE 32
140
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
141
+
142
+ #define CC_PASCAL 600
143
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
144
+ #define CC_VOLTA 700
145
+ #define CC_OFFSET_AMD 1000000
146
+ #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
147
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
148
+ #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
149
+
150
+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
151
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
152
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
153
+ // - 7B quantum model: +100-200 MB
154
+ // - 13B quantum model: +200-400 MB
155
+ //
156
+ //#define GGML_CUDA_FORCE_MMQ
157
+
158
+ // TODO: improve this to be correct for more hardware
159
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
160
+ #if !defined(GGML_CUDA_FORCE_MMQ)
161
+ #define CUDA_USE_TENSOR_CORES
162
+ #endif
163
+
164
+ #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
165
+ #define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
166
+
167
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
168
+
169
+ #if defined(_MSC_VER)
170
+ #pragma warning(disable: 4244 4267) // possible loss of data
171
+ #endif
172
+
173
+ #define GGML_CUDA_MAX_STREAMS 8
174
+
175
+ [[noreturn]]
176
+ void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
177
+
178
+ #define CUDA_CHECK_GEN(err, success, error_fn) \
179
+ do { \
180
+ auto err_ = (err); \
181
+ if (err_ != (success)) { \
182
+ ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
183
+ } \
184
+ } while (0)
185
+
186
+ #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
187
+
188
+ #if CUDART_VERSION >= 12000
189
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
190
+ return cublasGetStatusString(err);
191
+ }
192
+ #else
193
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
194
+ switch (err) {
195
+ case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
196
+ case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
197
+ case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
198
+ case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
199
+ case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
200
+ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
201
+ case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
202
+ case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
203
+ case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
204
+ default: return "unknown error";
205
+ }
206
+ }
207
+ #endif // CUDART_VERSION >= 12000
208
+
209
+ #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
210
+
211
+ #if !defined(GGML_USE_HIPBLAS)
212
+ static const char * cu_get_error_str(CUresult err) {
213
+ const char * err_str;
214
+ cuGetErrorString(err, &err_str);
215
+ return err_str;
216
+ }
217
+ #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
218
+ #endif
219
+
220
+ #if CUDART_VERSION >= 11100
221
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
222
+ #else
223
+ #define GGML_CUDA_ASSUME(x)
224
+ #endif // CUDART_VERSION >= 11100
225
+
226
+ #ifdef GGML_CUDA_F16
227
+ typedef half dfloat; // dequantize float
228
+ typedef half2 dfloat2;
229
+ #else
230
+ typedef float dfloat; // dequantize float
231
+ typedef float2 dfloat2;
232
+ #endif //GGML_CUDA_F16
233
+
234
+ // dmmv = dequantize_mul_mat_vec
235
+ // TODO: remove this?
236
+ #ifndef GGML_CUDA_DMMV_X
237
+ #define GGML_CUDA_DMMV_X 32
238
+ #endif
239
+
240
+ [[noreturn]]
241
+ static __device__ void no_device_code(
242
+ const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
243
+
244
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
245
+ printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
246
+ file_name, line, function_name, arch);
247
+ GGML_UNUSED(arch_list);
248
+ #else
249
+ printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
250
+ file_name, line, function_name, arch, arch_list);
251
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
252
+ __trap();
253
+
254
+ GGML_UNUSED(no_device_code); // suppress unused function warning
255
+ }
256
+
257
+ #ifdef __CUDA_ARCH__
258
+ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
259
+ #else
260
+ #define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
261
+ #endif // __CUDA_ARCH__
262
+
263
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
264
+ #pragma unroll
265
+ for (int mask = 16; mask > 0; mask >>= 1) {
266
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
267
+ }
268
+ return x;
269
+ }
270
+
271
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
272
+ #pragma unroll
273
+ for (int mask = 16; mask > 0; mask >>= 1) {
274
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
275
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
276
+ }
277
+ return a;
278
+ }
279
+
280
+ #ifdef GGML_CUDA_F16
281
+ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
282
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
283
+ #pragma unroll
284
+ for (int mask = 16; mask > 0; mask >>= 1) {
285
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
286
+ }
287
+ return a;
288
+ #else
289
+ GGML_UNUSED(a);
290
+ NO_DEVICE_CODE;
291
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
292
+ }
293
+ #endif // GGML_CUDA_F16
294
+
295
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
296
+ #pragma unroll
297
+ for (int mask = 16; mask > 0; mask >>= 1) {
298
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
299
+ }
300
+ return x;
301
+ }
302
+
303
+ //static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
304
+ //#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
305
+ //#pragma unroll
306
+ // for (int mask = 16; mask > 0; mask >>= 1) {
307
+ // x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
308
+ // }
309
+ // return x;
310
+ //#else
311
+ // GGML_UNUSED(x);
312
+ // NO_DEVICE_CODE;
313
+ //#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
314
+ //}
315
+
316
+
317
+ #if defined(GGML_USE_HIPBLAS)
318
+ #define __CUDA_ARCH__ 1300
319
+
320
+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
321
+ defined(__gfx1150__) || defined(__gfx1151__)
322
+ #define RDNA3
323
+ #endif
324
+
325
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
326
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
327
+ #define RDNA2
328
+ #endif
329
+
330
+ #ifndef __has_builtin
331
+ #define __has_builtin(x) 0
332
+ #endif
333
+
334
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
335
+ typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
336
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
337
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
338
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
339
+ #if __has_builtin(__builtin_elementwise_sub_sat)
340
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
341
+ return reinterpret_cast<const int &>(c);
342
+ #else
343
+ int8x4_t c;
344
+ int16_t tmp;
345
+ #pragma unroll
346
+ for (int i = 0; i < 4; i++) {
347
+ tmp = va[i] - vb[i];
348
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
349
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
350
+ c[i] = tmp;
351
+ }
352
+ return reinterpret_cast<int &>(c);
353
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
354
+ }
355
+
356
+ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
357
+ return __vsubss4(a, b);
358
+ }
359
+
360
+ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
361
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
362
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
363
+ unsigned int c;
364
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
365
+ #pragma unroll
366
+ for (int i = 0; i < 4; ++i) {
367
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
368
+ }
369
+ return c;
370
+ }
371
+
372
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
373
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
374
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
375
+ #elif defined(RDNA3)
376
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
377
+ #elif defined(__gfx1010__) || defined(__gfx900__)
378
+ int tmp1;
379
+ int tmp2;
380
+ asm("\n \
381
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
382
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
383
+ v_add3_u32 %0, %1, %2, %0 \n \
384
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
385
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
386
+ v_add3_u32 %0, %1, %2, %0 \n \
387
+ "
388
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
389
+ : "v"(a), "v"(b)
390
+ );
391
+ #else
392
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
393
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
394
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
395
+ #endif
396
+ return c;
397
+ }
398
+ #endif // defined(GGML_USE_HIPBLAS)
+
+ // TODO: move to ggml-common.h
+ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+
+
+ //////////////////////
+
+ struct ggml_cuda_device_info {
+ int device_count;
+
+ struct cuda_device_info {
+ int cc; // compute capability
+ size_t smpb; // max. shared memory per block
+ bool vmm; // virtual memory support
+ size_t vmm_granularity; // granularity of virtual memory
+ size_t total_vram;
+ };
+
+ cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
+
+ std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
+ };
+
+ const ggml_cuda_device_info & ggml_cuda_info();
+
+ void ggml_cuda_set_device(int device);
+ int ggml_cuda_get_device();
+
+ struct ggml_cuda_pool {
+ virtual ~ggml_cuda_pool() = default;
+
+ virtual void * alloc(size_t size, size_t * actual_size) = 0;
+ virtual void free(void * ptr, size_t size) = 0;
+ };
+
+ template<typename T>
+ struct ggml_cuda_pool_alloc {
+ ggml_cuda_pool * pool = nullptr;
+ T * ptr = nullptr;
+ size_t actual_size = 0;
+
+ ggml_cuda_pool_alloc() = default;
+
+ explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
+ }
+
+ ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
+ alloc(size);
+ }
+
+ ~ggml_cuda_pool_alloc() {
+ if (ptr != nullptr) {
+ pool->free(ptr, actual_size);
+ }
+ }
+
+ // size is in number of elements
+ T * alloc(size_t size) {
+ GGML_ASSERT(pool != nullptr);
+ GGML_ASSERT(ptr == nullptr);
+ ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+ return ptr;
+ }
+
+ T * alloc(ggml_cuda_pool & pool, size_t size) {
+ this->pool = &pool;
+ return alloc(size);
+ }
+
+ T * get() {
+ return ptr;
+ }
+
+ ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
+ ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
+ ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
+ ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
+ };
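ggml_cuda_pool_alloc is an RAII helper around ggml_cuda_pool: alloc() takes a size in elements, and the destructor returns the buffer to the pool. A minimal usage sketch (the pool reference and the element count are illustrative placeholders, not from the diff):

// Scope a temporary device buffer to a block using the pool allocator above.
static void example_pool_usage(ggml_cuda_pool & pool) {
    ggml_cuda_pool_alloc<float> tmp(pool, 1024); // reserves 1024*sizeof(float) bytes from the pool
    float * dev_ptr = tmp.get();                 // raw device pointer, usable in kernel launches
    // ... launch kernels that read/write dev_ptr ...
}                                                // destructor hands the buffer back to the pool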
+
+
+ // backend interface
+
+ struct ggml_tensor_extra_gpu {
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
+ };
+
+ struct ggml_backend_cuda_context {
+ int device;
+ std::string name;
+ cudaEvent_t copy_event = nullptr;
+
+ cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
+ cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ explicit ggml_backend_cuda_context(int device) :
+ device(device),
+ name(GGML_CUDA_NAME + std::to_string(device)) {
+ }
+
+ ~ggml_backend_cuda_context() {
+ if (copy_event != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(copy_event));
+ }
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+ for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+ if (streams[i][j] != nullptr) {
+ CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+ }
+ }
+ if (cublas_handles[i] != nullptr) {
+ CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+ }
+ }
+ }
+
+ cudaStream_t stream(int device, int stream) {
+ if (streams[device][stream] == nullptr) {
+ ggml_cuda_set_device(device);
+ CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
+ }
+ return streams[device][stream];
+ }
+
+ cudaStream_t stream() {
+ return stream(device, 0);
+ }
+
+ cublasHandle_t cublas_handle(int device) {
+ if (cublas_handles[device] == nullptr) {
+ ggml_cuda_set_device(device);
+ CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
+ CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
+ }
+ return cublas_handles[device];
+ }
+
+ cublasHandle_t cublas_handle() {
+ return cublas_handle(device);
+ }
+
+ // pool
+ std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+
+ ggml_cuda_pool & pool(int device) {
+ if (pools[device] == nullptr) {
+ pools[device] = new_pool_for_device(device);
+ }
+ return *pools[device];
+ }
+
+ ggml_cuda_pool & pool() {
+ return pool(device);
+ }
+ };
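The context owns per-device CUDA streams and cuBLAS handles, creates them lazily on first use, and destroys them in its destructor. A minimal caller-side sketch (assumes a valid context; the cublasSetStream call is only an example of a typical follow-up):

static void example_context_usage(ggml_backend_cuda_context & ctx) {
    cudaStream_t   stream = ctx.stream();          // stream 0 of ctx.device, created on demand
    cublasHandle_t handle = ctx.cublas_handle();   // cuBLAS handle for ctx.device, TF32 math mode enabled
    CUBLAS_CHECK(cublasSetStream(handle, stream)); // e.g. bind the handle to the context stream
}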
ggml-cuda/concat.cu ADDED
@@ -0,0 +1,49 @@
+ #include "concat.cuh"
+
+ static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (nidx >= ne0) {
+ return;
+ }
+ // operation
+ int offset_dst =
+ nidx +
+ blockIdx.y * ne0 +
+ blockIdx.z * ne0 * gridDim.y;
+ if (blockIdx.z < ne02) { // src0
+ int offset_src =
+ nidx +
+ blockIdx.y * ne0 +
+ blockIdx.z * ne0 * gridDim.y;
+ dst[offset_dst] = x[offset_src];
+ } else {
+ int offset_src =
+ nidx +
+ blockIdx.y * ne0 +
+ (blockIdx.z - ne02) * ne0 * gridDim.y;
+ dst[offset_dst] = y[offset_src];
+ }
+ }
+
+ static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+ int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+ dim3 gridDim(num_blocks, ne1, ne2);
+ concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+ }
+
+ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+ const float * src0_d = (const float *)src0->data;
+ const float * src1_d = (const float *)src1->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+ concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
+ }
+ }
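For reference, concat_f32 writes dst as src0 and src1 stacked along dimension 2 of a contiguous f32 tensor: slices with i2 < ne02 come from src0, the rest from src1. A CPU sketch of the same indexing (illustrative only, not part of the diff):

static void concat_f32_ref(const float * x, const float * y, float * dst,
                           int ne0, int ne1, int ne2, int ne02) {
    for (int i2 = 0; i2 < ne2; ++i2) {
        for (int i1 = 0; i1 < ne1; ++i1) {
            for (int i0 = 0; i0 < ne0; ++i0) {
                const int offset_dst = i0 + i1*ne0 + i2*ne0*ne1;
                dst[offset_dst] = i2 < ne02
                    ? x[i0 + i1*ne0 + i2*ne0*ne1]           // first ne02 slices: src0
                    : y[i0 + i1*ne0 + (i2 - ne02)*ne0*ne1]; // remaining slices: src1
            }
        }
    }
}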
ggml-cuda/concat.cuh ADDED
@@ -0,0 +1,5 @@
+ #include "common.cuh"
+
+ #define CUDA_CONCAT_BLOCK_SIZE 256
+
+ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/convert.cu ADDED
@@ -0,0 +1,824 @@
1
+ #include "convert.cuh"
2
+ #include "dequantize.cuh"
3
+
4
+ #define CUDA_Q8_0_NE_ALIGN 2048
5
+
6
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
7
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
8
+ const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
9
+
10
+ if (i >= k) {
11
+ return;
12
+ }
13
+
14
+ const int ib = i/qk; // block index
15
+ const int iqs = (i%qk)/qr; // quant index
16
+ const int iybs = i - i%qk; // y block start index
17
+ const int y_offset = qr == 1 ? 1 : qk/2;
18
+
19
+ // dequantize
20
+ dfloat2 v;
21
+ dequantize_kernel(vx, ib, iqs, v);
22
+
23
+ y[iybs + iqs + 0] = v.x;
24
+ y[iybs + iqs + y_offset] = v.y;
25
+ }
26
+
27
+ template <bool need_check>
28
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
29
+ #if __CUDA_ARCH__ >= CC_PASCAL
30
+ constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
31
+
32
+ const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
33
+ const int * x0 = ((int *) vx) + blockIdx.x * nint;
34
+ half2 * y2 = (half2 *) (y + i0);
35
+
36
+ __shared__ int vals[nint];
37
+
38
+ #pragma unroll
39
+ for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
40
+ if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
41
+ break;
42
+ }
43
+
44
+ const int ix = ix0 + threadIdx.x;
45
+ vals[ix] = x0[ix];
46
+ }
47
+
48
+ #pragma unroll
49
+ for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
50
+ if (need_check && i0 + iy + 2*threadIdx.x >= k) {
51
+ return;
52
+ }
53
+
54
+ const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
55
+ const half d = *b0;
56
+ const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
57
+
58
+ y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
59
+ }
60
+ #else
61
+ GGML_UNUSED(vx);
62
+ GGML_UNUSED(y);
63
+ GGML_UNUSED(k);
64
+ NO_DEVICE_CODE;
65
+ #endif // __CUDA_ARCH__ >= CC_PASCAL
66
+ }
67
+
68
+ template<typename dst_t>
69
+ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
70
+
71
+ const int i = blockIdx.x;
72
+
73
+ // assume 32 threads
74
+ const int tid = threadIdx.x;
75
+ const int il = tid/8;
76
+ const int ir = tid%8;
77
+ const int ib = 8*i + ir;
78
+ if (ib >= nb32) {
79
+ return;
80
+ }
81
+
82
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
83
+
84
+ const block_q4_0 * x = (const block_q4_0 *)vx + ib;
85
+ const float d = __half2float(x->d);
86
+ const float dm = -8*d;
87
+
88
+ const uint8_t * q = x->qs + 4*il;
89
+
90
+ for (int l = 0; l < 4; ++l) {
91
+ y[l+ 0] = d * (q[l] & 0xF) + dm;
92
+ y[l+16] = d * (q[l] >> 4) + dm;
93
+ }
94
+ }
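The Q4_0 kernel above evaluates d*q + dm with dm = -8*d, i.e. each 4-bit value q maps to d * (q - 8). A one-line host reference (a sketch for clarity):

// Dequantize a single Q4_0 nibble (0..15) given the block scale d.
static float dequant_q4_0_value(float d, unsigned int nibble) {
    return d * ((int) nibble - 8);
}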
95
+
96
+ template<typename dst_t>
97
+ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
98
+
99
+ const int i = blockIdx.x;
100
+
101
+ // assume 32 threads
102
+ const int tid = threadIdx.x;
103
+ const int il = tid/8;
104
+ const int ir = tid%8;
105
+ const int ib = 8*i + ir;
106
+ if (ib >= nb32) {
107
+ return;
108
+ }
109
+
110
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
111
+
112
+ const block_q4_1 * x = (const block_q4_1 *)vx + ib;
113
+ const float2 d = __half22float2(x->dm);
114
+
115
+ const uint8_t * q = x->qs + 4*il;
116
+
117
+ for (int l = 0; l < 4; ++l) {
118
+ y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
119
+ y[l+16] = d.x * (q[l] >> 4) + d.y;
120
+ }
121
+ }
122
+
123
+ //================================== k-quants
124
+
125
+ template<typename dst_t>
126
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
127
+
128
+ const int i = blockIdx.x;
129
+ const block_q2_K * x = (const block_q2_K *) vx;
130
+
131
+ const int tid = threadIdx.x;
132
+ #if QK_K == 256
133
+ const int n = tid/32;
134
+ const int l = tid - 32*n;
135
+ const int is = 8*n + l/16;
136
+
137
+ const uint8_t q = x[i].qs[32*n + l];
138
+ dst_t * y = yy + i*QK_K + 128*n;
139
+
140
+ float dall = __low2half(x[i].dm);
141
+ float dmin = __high2half(x[i].dm);
142
+ y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
143
+ y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
144
+ y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
145
+ y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
146
+ #else
147
+ const int is = tid/16; // 0 or 1
148
+ const int il = tid%16; // 0...15
149
+ const uint8_t q = x[i].qs[il] >> (2*is);
150
+ dst_t * y = yy + i*QK_K + 16*is + il;
151
+ float dall = __low2half(x[i].dm);
152
+ float dmin = __high2half(x[i].dm);
153
+ y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
154
+ y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
155
+ #endif
156
+
157
+ }
158
+
159
+ template<typename dst_t>
160
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
161
+
162
+ const int i = blockIdx.x;
163
+ const block_q3_K * x = (const block_q3_K *) vx;
164
+
165
+ #if QK_K == 256
166
+ const int r = threadIdx.x/4;
167
+ const int tid = r/2;
168
+ const int is0 = r%2;
169
+ const int l0 = 16*is0 + 4*(threadIdx.x%4);
170
+ const int n = tid / 4;
171
+ const int j = tid - 4*n;
172
+
173
+ uint8_t m = 1 << (4*n + j);
174
+ int is = 8*n + 2*j + is0;
175
+ int shift = 2*j;
176
+
177
+ int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
178
+ is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
179
+ is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
180
+ (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
181
+ float d_all = x[i].d;
182
+ float dl = d_all * (us - 32);
183
+
184
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
185
+ const uint8_t * q = x[i].qs + 32*n;
186
+ const uint8_t * hm = x[i].hmask;
187
+
188
+ for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
189
+ #else
190
+ const int tid = threadIdx.x;
191
+ const int is = tid/16; // 0 or 1
192
+ const int il = tid%16; // 0...15
193
+ const int im = il/8; // 0...1
194
+ const int in = il%8; // 0...7
195
+
196
+ dst_t * y = yy + i*QK_K + 16*is + il;
197
+
198
+ const uint8_t q = x[i].qs[il] >> (2*is);
199
+ const uint8_t h = x[i].hmask[in] >> (2*is + im);
200
+ const float d = (float)x[i].d;
201
+
202
+ if (is == 0) {
203
+ y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
204
+ y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
205
+ } else {
206
+ y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
207
+ y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
208
+ }
209
+ #endif
210
+
211
+ }
212
+
213
+ #if QK_K == 256
214
+ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
215
+ if (j < 4) {
216
+ d = q[j] & 63; m = q[j + 4] & 63;
217
+ } else {
218
+ d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
219
+ m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
220
+ }
221
+ }
222
+ #endif
223
+
224
+ template<typename dst_t>
225
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
226
+ const block_q4_K * x = (const block_q4_K *) vx;
227
+
228
+ const int i = blockIdx.x;
229
+
230
+ #if QK_K == 256
231
+ // assume 32 threads
232
+ const int tid = threadIdx.x;
233
+ const int il = tid/8;
234
+ const int ir = tid%8;
235
+ const int is = 2*il;
236
+ const int n = 4;
237
+
238
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
239
+
240
+ const float dall = __low2half(x[i].dm);
241
+ const float dmin = __high2half(x[i].dm);
242
+
243
+ const uint8_t * q = x[i].qs + 32*il + n*ir;
244
+
245
+ uint8_t sc, m;
246
+ get_scale_min_k4(is + 0, x[i].scales, sc, m);
247
+ const float d1 = dall * sc; const float m1 = dmin * m;
248
+ get_scale_min_k4(is + 1, x[i].scales, sc, m);
249
+ const float d2 = dall * sc; const float m2 = dmin * m;
250
+ for (int l = 0; l < n; ++l) {
251
+ y[l + 0] = d1 * (q[l] & 0xF) - m1;
252
+ y[l +32] = d2 * (q[l] >> 4) - m2;
253
+ }
254
+ #else
255
+ const int tid = threadIdx.x;
256
+ const uint8_t * q = x[i].qs;
257
+ dst_t * y = yy + i*QK_K;
258
+ const float d = (float)x[i].dm[0];
259
+ const float m = (float)x[i].dm[1];
260
+ y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
261
+ y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
262
+ #endif
263
+ }
264
+
265
+ template<typename dst_t>
266
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
267
+ const block_q5_K * x = (const block_q5_K *) vx;
268
+
269
+ const int i = blockIdx.x;
270
+
271
+ #if QK_K == 256
272
+ // assume 64 threads - this is very slightly better than the one below
273
+ const int tid = threadIdx.x;
274
+ const int il = tid/16; // il is in 0...3
275
+ const int ir = tid%16; // ir is in 0...15
276
+ const int is = 2*il; // is is in 0...6
277
+
278
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
279
+
280
+ const float dall = __low2half(x[i].dm);
281
+ const float dmin = __high2half(x[i].dm);
282
+
283
+ const uint8_t * ql = x[i].qs + 32*il + 2*ir;
284
+ const uint8_t * qh = x[i].qh + 2*ir;
285
+
286
+ uint8_t sc, m;
287
+ get_scale_min_k4(is + 0, x[i].scales, sc, m);
288
+ const float d1 = dall * sc; const float m1 = dmin * m;
289
+ get_scale_min_k4(is + 1, x[i].scales, sc, m);
290
+ const float d2 = dall * sc; const float m2 = dmin * m;
291
+
292
+ uint8_t hm = 1 << (2*il);
293
+ y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
294
+ y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
295
+ hm <<= 1;
296
+ y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
297
+ y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
298
+ #else
299
+ const int tid = threadIdx.x;
300
+ const uint8_t q = x[i].qs[tid];
301
+ const int im = tid/8; // 0...3
302
+ const int in = tid%8; // 0...7
303
+ const int is = tid/16; // 0 or 1
304
+ const uint8_t h = x[i].qh[in] >> im;
305
+ const float d = x[i].d;
306
+ dst_t * y = yy + i*QK_K + tid;
307
+ y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
308
+ y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
309
+ #endif
310
+ }
311
+
312
+ template<typename dst_t>
313
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
314
+ const block_q6_K * x = (const block_q6_K *) vx;
315
+
316
+ const int i = blockIdx.x;
317
+ #if QK_K == 256
318
+
319
+ // assume 64 threads - this is very slightly better than the one below
320
+ const int tid = threadIdx.x;
321
+ const int ip = tid/32; // ip is 0 or 1
322
+ const int il = tid - 32*ip; // 0...32
323
+ const int is = 8*ip + il/16;
324
+
325
+ dst_t * y = yy + i*QK_K + 128*ip + il;
326
+
327
+ const float d = x[i].d;
328
+
329
+ const uint8_t * ql = x[i].ql + 64*ip + il;
330
+ const uint8_t qh = x[i].qh[32*ip + il];
331
+ const int8_t * sc = x[i].scales + is;
332
+
333
+ y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
334
+ y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
335
+ y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
336
+ y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
337
+ #else
338
+
339
+ // assume 32 threads
340
+ const int tid = threadIdx.x;
341
+ const int ip = tid/16; // 0 or 1
342
+ const int il = tid - 16*ip; // 0...15
343
+
344
+ dst_t * y = yy + i*QK_K + 16*ip + il;
345
+
346
+ const float d = x[i].d;
347
+
348
+ const uint8_t ql = x[i].ql[16*ip + il];
349
+ const uint8_t qh = x[i].qh[il] >> (2*ip);
350
+ const int8_t * sc = x[i].scales;
351
+
352
+ y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
353
+ y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
354
+ #endif
355
+ }
356
+
357
+ template<typename dst_t>
358
+ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
359
+
360
+ const int i = blockIdx.x;
361
+ const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
362
+
363
+ const int tid = threadIdx.x;
364
+ #if QK_K == 256
365
+ const int il = tid/8; // 0...3
366
+ const int ib = tid%8; // 0...7
367
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
368
+ const uint16_t * q2 = x[i].qs + 4*ib;
369
+ const uint8_t * aux8 = (const uint8_t *)q2;
370
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
371
+ const uint32_t aux32 = q2[2] | (q2[3] << 16);
372
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
373
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
374
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
375
+ #else
376
+ NO_DEVICE_CODE;
377
+ #endif
378
+
379
+ }
380
+
381
+ template<typename dst_t>
382
+ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
383
+
384
+ const int i = blockIdx.x;
385
+ const block_iq2_xs * x = (const block_iq2_xs *) vx;
386
+
387
+ const int tid = threadIdx.x;
388
+ #if QK_K == 256
389
+ const int il = tid/8; // 0...3
390
+ const int ib = tid%8; // 0...7
391
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
392
+ const uint16_t * q2 = x[i].qs + 4*ib;
393
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
394
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
395
+ const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
396
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
397
+ #else
398
+ NO_DEVICE_CODE;
399
+ #endif
400
+
401
+ }
402
+
403
+ template<typename dst_t>
404
+ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
405
+
406
+ const int i = blockIdx.x;
407
+ const block_iq2_s * x = (const block_iq2_s *) vx;
408
+
409
+ const int tid = threadIdx.x;
410
+ #if QK_K == 256
411
+ const int il = tid/8; // 0...3
412
+ const int ib = tid%8; // 0...7
413
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
414
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
415
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
416
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
417
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
418
+ #else
419
+ NO_DEVICE_CODE;
420
+ #endif
421
+
422
+ }
423
+
424
+ template<typename dst_t>
425
+ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
426
+
427
+ const int i = blockIdx.x;
428
+ const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
429
+
430
+ const int tid = threadIdx.x;
431
+ #if QK_K == 256
432
+ const int il = tid/8; // 0...3
433
+ const int ib = tid%8; // 0...7
434
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
435
+ const uint8_t * q3 = x[i].qs + 8*ib;
436
+ const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
437
+ const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
438
+ const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
439
+ const uint32_t aux32 = gas[0] | (gas[1] << 16);
440
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
441
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
442
+ for (int j = 0; j < 4; ++j) {
443
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
444
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
445
+ }
446
+ #else
447
+ NO_DEVICE_CODE;
448
+ #endif
449
+
450
+ }
451
+
452
+ template<typename dst_t>
453
+ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
454
+
455
+ const int i = blockIdx.x;
456
+ const block_iq3_s * x = (const block_iq3_s *) vx;
457
+
458
+ const int tid = threadIdx.x;
459
+ #if QK_K == 256
460
+ const int il = tid/8; // 0...3
461
+ const int ib = tid%8; // 0...7
462
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
463
+ const uint8_t * qs = x[i].qs + 8*ib;
464
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
465
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
466
+ const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
467
+ const uint8_t signs = x[i].signs[4*ib + il];
468
+ for (int j = 0; j < 4; ++j) {
469
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
470
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
471
+ }
472
+ #else
473
+ NO_DEVICE_CODE;
474
+ #endif
475
+
476
+ }
477
+
478
+ template<typename dst_t>
479
+ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
480
+
481
+ const int i = blockIdx.x;
482
+ const block_iq1_s * x = (const block_iq1_s *) vx;
483
+
484
+ const int tid = threadIdx.x;
485
+ #if QK_K == 256
486
+ const int il = tid/8; // 0...3
487
+ const int ib = tid%8; // 0...7
488
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
489
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
490
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
491
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
492
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
493
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
494
+ grid32[0] &= 0x0f0f0f0f;
495
+ for (int j = 0; j < 8; ++j) {
496
+ y[j] = d * (q[j] + delta);
497
+ }
498
+ #else
499
+ NO_DEVICE_CODE;
500
+ #endif
501
+
502
+ }
503
+
504
+ template<typename dst_t>
505
+ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
506
+
507
+ const int i = blockIdx.x;
508
+ const block_iq1_m * x = (const block_iq1_m *) vx;
509
+
510
+ const int tid = threadIdx.x;
511
+ #if QK_K == 256
512
+ const int il = tid/8; // 0...3
513
+ const int ib = tid%8; // 0...7
514
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
515
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
516
+ iq1m_scale_t scale;
517
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
518
+ const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
519
+ const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
520
+ const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
521
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
522
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
523
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
524
+ grid32[0] &= 0x0f0f0f0f;
525
+ for (int j = 0; j < 8; ++j) {
526
+ y[j] = d * (q[j] + delta);
527
+ }
528
+ #else
529
+ NO_DEVICE_CODE;
530
+ #endif
531
+
532
+ }
533
+
534
+
535
+ template<typename dst_t>
536
+ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
537
+
538
+ const int i = blockIdx.x;
539
+ const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
540
+
541
+ const int tid = threadIdx.x;
542
+ const int il = tid/8; // 0...3
543
+ const int ib = tid%8; // 0...7
544
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
545
+ const uint8_t * q4 = x[ib].qs + 4*il;
546
+ const float d = (float)x[ib].d;
547
+ for (int j = 0; j < 4; ++j) {
548
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
549
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
550
+ }
551
+
552
+ }
553
+
554
+ #if QK_K != 64
555
+ template<typename dst_t>
556
+ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
557
+ const int i = blockIdx.x;
558
+ const block_iq4_xs * x = (const block_iq4_xs *)vx;
559
+
560
+ const int tid = threadIdx.x;
561
+ const int il = tid/8; // 0...3
562
+ const int ib = tid%8; // 0...7
563
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
564
+ const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
565
+ const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
566
+ for (int j = 0; j < 4; ++j) {
567
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
568
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
569
+ }
570
+ }
571
+ #endif
572
+
573
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
574
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
575
+ const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
576
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
577
+ }
578
+
579
+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
580
+ const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
581
+ if (k % CUDA_Q8_0_NE_ALIGN == 0) {
582
+ const bool need_check = false;
583
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
584
+ } else {
585
+ const bool need_check = true;
586
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
587
+ }
588
+ }
589
+
590
+ template<typename dst_t>
591
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
592
+ const int nb = k / QK_K;
593
+ #if QK_K == 256
594
+ dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
595
+ #else
596
+ dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
597
+ #endif
598
+ }
599
+
600
+ template<typename dst_t>
601
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
602
+ const int nb = k / QK_K;
603
+ #if QK_K == 256
604
+ dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
605
+ #else
606
+ dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
607
+ #endif
608
+ }
609
+
610
+ template<typename dst_t>
611
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
612
+ const int nb32 = k / 32;
613
+ const int nb = (k + 255) / 256;
614
+ dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
615
+ }
616
+
617
+ template<typename dst_t>
618
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
619
+ const int nb32 = k / 32;
620
+ const int nb = (k + 255) / 256;
621
+ dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
622
+ }
623
+
624
+ template<typename dst_t>
625
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
626
+ const int nb = k / QK_K;
627
+ dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
628
+ }
629
+
630
+ template<typename dst_t>
631
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
632
+ const int nb = k / QK_K;
633
+ #if QK_K == 256
634
+ dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
635
+ #else
636
+ dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
637
+ #endif
638
+ }
639
+
640
+ template<typename dst_t>
641
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
642
+ const int nb = k / QK_K;
643
+ #if QK_K == 256
644
+ dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
645
+ #else
646
+ dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
647
+ #endif
648
+ }
649
+
650
+ template<typename dst_t>
651
+ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
652
+ const int nb = k / QK_K;
653
+ dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
654
+ }
655
+
656
+ template<typename dst_t>
657
+ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
658
+ const int nb = k / QK_K;
659
+ dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
660
+ }
661
+
662
+ template<typename dst_t>
663
+ static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
664
+ const int nb = k / QK_K;
665
+ dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
666
+ }
667
+
668
+ template<typename dst_t>
669
+ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
670
+ const int nb = k / QK_K;
671
+ dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
672
+ }
673
+
674
+ template<typename dst_t>
675
+ static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
676
+ const int nb = k / QK_K;
677
+ dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
678
+ }
679
+
680
+ template<typename dst_t>
681
+ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
682
+ const int nb = k / QK_K;
683
+ dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
684
+ }
685
+
686
+ template<typename dst_t>
687
+ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
688
+ const int nb = (k + QK_K - 1) / QK_K;
689
+ dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
690
+ }
691
+
692
+ template<typename dst_t>
693
+ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
694
+ const int nb = k / QK_K;
695
+ dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
696
+ }
697
+
698
+ template<typename dst_t>
699
+ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
700
+ const int nb = (k + QK_K - 1) / QK_K;
701
+ #if QK_K == 64
702
+ dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
703
+ #else
704
+ dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
705
+ #endif
706
+ }
707
+
708
+ template <typename src_t, typename dst_t>
709
+ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
710
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
711
+
712
+ if (i >= k) {
713
+ return;
714
+ }
715
+
716
+ const src_t * x = (src_t *) vx;
717
+
718
+ y[i] = x[i];
719
+ }
720
+
721
+ template <typename src_t, typename dst_t>
722
+ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
723
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
724
+ convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
725
+ }
726
+
727
+ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
728
+ int id;
729
+ switch (type) {
730
+ case GGML_TYPE_Q4_0:
731
+ return dequantize_row_q4_0_cuda;
732
+ case GGML_TYPE_Q4_1:
733
+ return dequantize_row_q4_1_cuda;
734
+ case GGML_TYPE_Q5_0:
735
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
736
+ case GGML_TYPE_Q5_1:
737
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
738
+ case GGML_TYPE_Q8_0:
739
+ CUDA_CHECK(cudaGetDevice(&id));
740
+ if (ggml_cuda_info().devices[id].cc >= CC_PASCAL) {
741
+ return dequantize_block_q8_0_f16_cuda;
742
+ }
743
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
744
+ case GGML_TYPE_Q2_K:
745
+ return dequantize_row_q2_K_cuda;
746
+ case GGML_TYPE_Q3_K:
747
+ return dequantize_row_q3_K_cuda;
748
+ case GGML_TYPE_Q4_K:
749
+ return dequantize_row_q4_K_cuda;
750
+ case GGML_TYPE_Q5_K:
751
+ return dequantize_row_q5_K_cuda;
752
+ case GGML_TYPE_Q6_K:
753
+ return dequantize_row_q6_K_cuda;
754
+ case GGML_TYPE_IQ2_XXS:
755
+ return dequantize_row_iq2_xxs_cuda;
756
+ case GGML_TYPE_IQ2_XS:
757
+ return dequantize_row_iq2_xs_cuda;
758
+ case GGML_TYPE_IQ2_S:
759
+ return dequantize_row_iq2_s_cuda;
760
+ case GGML_TYPE_IQ3_XXS:
761
+ return dequantize_row_iq3_xxs_cuda;
762
+ case GGML_TYPE_IQ1_S:
763
+ return dequantize_row_iq1_s_cuda;
764
+ case GGML_TYPE_IQ1_M:
765
+ return dequantize_row_iq1_m_cuda;
766
+ case GGML_TYPE_IQ4_NL:
767
+ return dequantize_row_iq4_nl_cuda;
768
+ case GGML_TYPE_IQ4_XS:
769
+ return dequantize_row_iq4_xs_cuda;
770
+ case GGML_TYPE_IQ3_S:
771
+ return dequantize_row_iq3_s_cuda;
772
+ case GGML_TYPE_F32:
773
+ return convert_unary_cuda<float>;
774
+ default:
775
+ return nullptr;
776
+ }
777
+ }
778
+
779
+ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
780
+ switch (type) {
781
+ case GGML_TYPE_Q4_0:
782
+ return dequantize_row_q4_0_cuda;
783
+ case GGML_TYPE_Q4_1:
784
+ return dequantize_row_q4_1_cuda;
785
+ case GGML_TYPE_Q5_0:
786
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
787
+ case GGML_TYPE_Q5_1:
788
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
789
+ case GGML_TYPE_Q8_0:
790
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
791
+ case GGML_TYPE_Q2_K:
792
+ return dequantize_row_q2_K_cuda;
793
+ case GGML_TYPE_Q3_K:
794
+ return dequantize_row_q3_K_cuda;
795
+ case GGML_TYPE_Q4_K:
796
+ return dequantize_row_q4_K_cuda;
797
+ case GGML_TYPE_Q5_K:
798
+ return dequantize_row_q5_K_cuda;
799
+ case GGML_TYPE_Q6_K:
800
+ return dequantize_row_q6_K_cuda;
801
+ case GGML_TYPE_IQ2_XXS:
802
+ return dequantize_row_iq2_xxs_cuda;
803
+ case GGML_TYPE_IQ2_XS:
804
+ return dequantize_row_iq2_xs_cuda;
805
+ case GGML_TYPE_IQ2_S:
806
+ return dequantize_row_iq2_s_cuda;
807
+ case GGML_TYPE_IQ3_XXS:
808
+ return dequantize_row_iq3_xxs_cuda;
809
+ case GGML_TYPE_IQ1_S:
810
+ return dequantize_row_iq1_s_cuda;
811
+ case GGML_TYPE_IQ1_M:
812
+ return dequantize_row_iq1_m_cuda;
813
+ case GGML_TYPE_IQ4_NL:
814
+ return dequantize_row_iq4_nl_cuda;
815
+ case GGML_TYPE_IQ4_XS:
816
+ return dequantize_row_iq4_xs_cuda;
817
+ case GGML_TYPE_IQ3_S:
818
+ return dequantize_row_iq3_s_cuda;
819
+ case GGML_TYPE_F16:
820
+ return convert_unary_cuda<half>;
821
+ default:
822
+ return nullptr;
823
+ }
824
+ }
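ggml_get_to_fp16_cuda / ggml_get_to_fp32_cuda return launcher functions (typedefs in convert.cuh below) that dequantize k elements on a given stream, or nullptr for unsupported types. A usage sketch (device buffers are assumed to be allocated and correctly sized by the caller):

// Convert k quantized elements of type `type` to fp32 on `stream`, if a kernel exists for it.
static void example_dequantize(const void * q_dev, float * f32_dev, int k, ggml_type type, cudaStream_t stream) {
    const to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(type);
    if (to_fp32 == nullptr) {
        return; // no fp32 conversion kernel for this type
    }
    to_fp32(q_dev, f32_dev, k, stream); // asynchronous launch on `stream`
}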
ggml-cuda/convert.cuh ADDED
@@ -0,0 +1,13 @@
+ #include "common.cuh"
+
+ #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+
+ template<typename T>
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+
+ typedef to_t_cuda_t<float> to_fp32_cuda_t;
+ typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
+ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
+
+ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
ggml-cuda/cpy.cu ADDED
@@ -0,0 +1,461 @@
1
+ #include "cpy.cuh"
2
+
3
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
4
+
5
+ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
6
+ const float * xi = (const float *) cxi;
7
+ float * dsti = (float *) cdsti;
8
+
9
+ *dsti = *xi;
10
+ }
11
+
12
+ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
13
+ const float * xi = (const float *) cxi;
14
+ half * dsti = (half *) cdsti;
15
+
16
+ *dsti = __float2half(*xi);
17
+ }
18
+
19
+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
20
+ const half * xi = (const half *) cxi;
21
+ half * dsti = (half *) cdsti;
22
+
23
+ *dsti = *xi;
24
+ }
25
+
26
+ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
27
+ const half * xi = (const half *) cxi;
28
+ float * dsti = (float *) cdsti;
29
+
30
+ *dsti = *xi;
31
+ }
32
+
33
+ template <cpy_kernel_t cpy_1>
34
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
35
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
36
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
37
+ const int nb12, const int nb13) {
38
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
39
+
40
+ if (i >= ne) {
41
+ return;
42
+ }
43
+
44
+ // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
45
+ // then combine those indices with the corresponding byte offsets to get the total offsets
46
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
47
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
48
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
49
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
50
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
51
+
52
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
53
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
54
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
55
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
56
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
57
+
58
+ cpy_1(cx + x_offset, cdst + dst_offset);
59
+ }
60
+
61
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
62
+ const float * xi = (const float *) cxi;
63
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
64
+
65
+ float amax = 0.0f; // absolute max
66
+
67
+ for (int j = 0; j < QK8_0; j++) {
68
+ const float v = xi[j];
69
+ amax = fmaxf(amax, fabsf(v));
70
+ }
71
+
72
+ const float d = amax / ((1 << 7) - 1);
73
+ const float id = d ? 1.0f/d : 0.0f;
74
+
75
+ dsti->d = d;
76
+
77
+ for (int j = 0; j < QK8_0; ++j) {
78
+ const float x0 = xi[j]*id;
79
+
80
+ dsti->qs[j] = roundf(x0);
81
+ }
82
+ }
83
+
84
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
85
+ const float * xi = (const float *) cxi;
86
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
87
+
88
+ float amax = 0.0f;
89
+ float vmax = 0.0f;
90
+
91
+ for (int j = 0; j < QK4_0; ++j) {
92
+ const float v = xi[j];
93
+ if (amax < fabsf(v)) {
94
+ amax = fabsf(v);
95
+ vmax = v;
96
+ }
97
+ }
98
+
99
+ const float d = vmax / -8;
100
+ const float id = d ? 1.0f/d : 0.0f;
101
+
102
+ dsti->d = d;
103
+
104
+ for (int j = 0; j < QK4_0/2; ++j) {
105
+ const float x0 = xi[0 + j]*id;
106
+ const float x1 = xi[QK4_0/2 + j]*id;
107
+
108
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
109
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
110
+
111
+ dsti->qs[j] = xi0;
112
+ dsti->qs[j] |= xi1 << 4;
113
+ }
114
+ }
115
+
116
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
117
+ const float * xi = (const float *) cxi;
118
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
119
+
120
+ float vmin = FLT_MAX;
121
+ float vmax = -FLT_MAX;
122
+
123
+ for (int j = 0; j < QK4_1; ++j) {
124
+ const float v = xi[j];
125
+
126
+ if (v < vmin) vmin = v;
127
+ if (v > vmax) vmax = v;
128
+ }
129
+
130
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
131
+ const float id = d ? 1.0f/d : 0.0f;
132
+
133
+ dsti->dm.x = d;
134
+ dsti->dm.y = vmin;
135
+
136
+ for (int j = 0; j < QK4_1/2; ++j) {
137
+ const float x0 = (xi[0 + j] - vmin)*id;
138
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
139
+
140
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
141
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
142
+
143
+ dsti->qs[j] = xi0;
144
+ dsti->qs[j] |= xi1 << 4;
145
+ }
146
+ }
147
+
148
+ static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
149
+ const float * xi = (const float *) cxi;
150
+ block_q5_0 * dsti = (block_q5_0 *) cdsti;
151
+
152
+ float amax = 0.0f;
153
+ float vmax = 0.0f;
154
+
155
+ for (int j = 0; j < QK5_0; ++j) {
156
+ const float v = xi[j];
157
+ if (amax < fabsf(v)) {
158
+ amax = fabsf(v);
159
+ vmax = v;
160
+ }
161
+ }
162
+
163
+ const float d = vmax / -16;
164
+ const float id = d ? 1.0f/d : 0.0f;
165
+
166
+ dsti->d = d;
167
+
168
+ uint32_t qh = 0;
169
+ for (int j = 0; j < QK5_0/2; ++j) {
170
+ const float x0 = xi[0 + j]*id;
171
+ const float x1 = xi[QK5_0/2 + j]*id;
172
+
173
+ const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
174
+ const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
175
+
176
+ dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
177
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
178
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
179
+ }
180
+ memcpy(dsti->qh, &qh, sizeof(qh));
181
+ }
182
+
183
+ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
184
+ const float * xi = (const float *) cxi;
185
+ block_q5_1 * dsti = (block_q5_1 *) cdsti;
186
+
187
+ float min = xi[0];
188
+ float max = xi[0];
189
+
190
+ for (int j = 1; j < QK5_1; ++j) {
191
+ const float v = xi[j];
192
+ min = v < min ? v : min;
193
+ max = v > max ? v : max;
194
+ }
195
+
196
+ const float d = (max - min) / 31;
197
+ const float id = d ? 1.0f/d : 0.0f;
198
+
199
+ dsti->dm.x = d;
200
+ dsti->dm.y = min;
201
+
202
+ uint32_t qh = 0;
203
+ for (int j = 0; j < QK5_1/2; ++j) {
204
+ const float x0 = (xi[0 + j] - min)*id;
205
+ const float x1 = (xi[QK5_1/2 + j] - min)*id;
206
+
207
+ const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
208
+ const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
209
+
210
+ dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
211
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
212
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
213
+ }
214
+ memcpy(dsti->qh, &qh, sizeof(qh));
215
+ }
216
+
217
+
218
+ static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
219
+ if (x <= val[0]) return 0;
220
+ if (x >= val[n-1]) return n-1;
221
+ int ml = 0, mu = n-1;
222
+ while (mu-ml > 1) {
223
+ int mav = (ml+mu)/2;
224
+ if (x < val[mav]) mu = mav; else ml = mav;
225
+ }
226
+ return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
227
+ }
228
+
229
+ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
230
+ const float * xi = (const float *) cxi;
231
+ block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
232
+
233
+ float amax = 0.0f;
234
+ float vmax = 0.0f;
235
+
236
+ for (int j = 0; j < QK4_NL; ++j) {
237
+ const float v = xi[j];
238
+ if (amax < fabsf(v)) {
239
+ amax = fabsf(v);
240
+ vmax = v;
241
+ }
242
+ }
243
+
244
+ float d = vmax / kvalues_iq4nl[0];
245
+ const float id = d ? 1.0f/d : 0.0f;
246
+
247
+ float sumqx = 0, sumq2 = 0;
248
+ for (int j = 0; j < QK4_NL/2; ++j) {
249
+ const float x0 = xi[0 + j]*id;
250
+ const float x1 = xi[QK4_NL/2 + j]*id;
251
+ const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
252
+ const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
253
+ dsti->qs[j] = xi0 | (xi1 << 4);
254
+ const float v0 = kvalues_iq4nl[xi0];
255
+ const float v1 = kvalues_iq4nl[xi1];
256
+ const float w0 = xi[0 + j]*xi[0 + j];
257
+ const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
258
+ sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
259
+ sumq2 += w0*v0*v0 + w1*v1*v1;
260
+ }
261
+
262
+ dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
263
+ }
264
+
265
+ template <cpy_kernel_t cpy_blck, int qk>
266
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
267
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
268
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
269
+ const int nb12, const int nb13) {
270
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
271
+
272
+ if (i >= ne) {
273
+ return;
274
+ }
275
+
276
+ const int i03 = i/(ne00 * ne01 * ne02);
277
+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
278
+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
279
+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
280
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
281
+
282
+ const int i13 = i/(ne10 * ne11 * ne12);
283
+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
284
+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
285
+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
286
+ const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
287
+
288
+ cpy_blck(cx + x_offset, cdst + dst_offset);
289
+ }
290
+
291
+ static void ggml_cpy_f16_f32_cuda(
292
+ const char * cx, char * cdst, const int ne,
293
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
294
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
295
+
296
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
297
+ cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
298
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
299
+ }
300
+
301
+ static void ggml_cpy_f32_f32_cuda(
302
+ const char * cx, char * cdst, const int ne,
303
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
304
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
305
+
306
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
307
+ cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
308
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
309
+ }
310
+
311
+ static void ggml_cpy_f32_f16_cuda(
312
+ const char * cx, char * cdst, const int ne,
313
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
314
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
315
+
316
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
317
+ cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
318
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
319
+ }
320
+
321
+ static void ggml_cpy_f32_q8_0_cuda(
322
+ const char * cx, char * cdst, const int ne,
323
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
324
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
325
+
326
+ GGML_ASSERT(ne % QK8_0 == 0);
327
+ const int num_blocks = ne / QK8_0;
328
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
329
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
330
+ }
331
+
332
+ static void ggml_cpy_f32_q4_0_cuda(
333
+ const char * cx, char * cdst, const int ne,
334
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
335
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
336
+
337
+ GGML_ASSERT(ne % QK4_0 == 0);
338
+ const int num_blocks = ne / QK4_0;
339
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
340
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
341
+ }
342
+
343
+ static void ggml_cpy_f32_q4_1_cuda(
344
+ const char * cx, char * cdst, const int ne,
345
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
346
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
347
+
348
+ GGML_ASSERT(ne % QK4_1 == 0);
349
+ const int num_blocks = ne / QK4_1;
350
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
351
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
352
+ }
353
+
354
+ static void ggml_cpy_f32_q5_0_cuda(
355
+ const char * cx, char * cdst, const int ne,
356
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
357
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
358
+
359
+ GGML_ASSERT(ne % QK5_0 == 0);
360
+ const int num_blocks = ne / QK5_0;
361
+ cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
362
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
363
+ }
364
+
365
+ static void ggml_cpy_f32_q5_1_cuda(
366
+ const char * cx, char * cdst, const int ne,
367
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
368
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
369
+
370
+ GGML_ASSERT(ne % QK5_1 == 0);
371
+ const int num_blocks = ne / QK5_1;
372
+ cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
373
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
374
+ }
375
+
376
+ static void ggml_cpy_f32_iq4_nl_cuda(
377
+ const char * cx, char * cdst, const int ne,
378
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
379
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_NL == 0);
+ const int num_blocks = ne / QK4_NL;
+ cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+
+ static void ggml_cpy_f16_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+
+ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+ const int64_t ne = ggml_nelements(src0);
+ GGML_ASSERT(ne == ggml_nelements(src1));
+
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+
+ //GGML_ASSERT(src0->ne[3] == 1);
+
+ const int64_t nb00 = src0->nb[0];
+ const int64_t nb01 = src0->nb[1];
+ const int64_t nb02 = src0->nb[2];
+ const int64_t nb03 = src0->nb[3];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+
+ //GGML_ASSERT(src1->ne[3] == 1);
+
+ const int64_t nb10 = src1->nb[0];
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb12 = src1->nb[2];
+ const int64_t nb13 = src1->nb[3];
+
+ cudaStream_t main_stream = ctx.stream();
+
+ char * src0_ddc = (char *) src0->data;
+ char * src1_ddc = (char *) src1->data;
+
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+ ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+ ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+ ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+ ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else {
+ fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
+ GGML_ASSERT(false);
+ }
+ }
+
+ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ ggml_cuda_cpy(ctx, src0, dst);
+ }
ggml-cuda/cpy.cuh ADDED
@@ -0,0 +1,7 @@
+ #include "common.cuh"
+
+ #define CUDA_CPY_BLOCK_SIZE 32
+
+ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
+
+ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/dequantize.cuh ADDED
@@ -0,0 +1,103 @@
+ #include "common.cuh"
+
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const block_q4_0 * x = (const block_q4_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ const int vui = x[ib].qs[iqs];
+
+ v.x = vui & 0xF;
+ v.y = vui >> 4;
+
+ #ifdef GGML_CUDA_F16
+ v = __hsub2(v, {8.0f, 8.0f});
+ v = __hmul2(v, {d, d});
+ #else
+ v.x = (v.x - 8.0f) * d;
+ v.y = (v.y - 8.0f) * d;
+ #endif // GGML_CUDA_F16
+ }
+
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const block_q4_1 * x = (const block_q4_1 *) vx;
+
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
+
+ const int vui = x[ib].qs[iqs];
+
+ v.x = vui & 0xF;
+ v.y = vui >> 4;
+
+ #ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+ #else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+ #endif // GGML_CUDA_F16
+ }
+
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const block_q5_0 * x = (const block_q5_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ uint32_t qh;
+ memcpy(&qh, x[ib].qh, sizeof(qh));
+
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+ #ifdef GGML_CUDA_F16
+ v = __hsub2(v, {16.0f, 16.0f});
+ v = __hmul2(v, {d, d});
+ #else
+ v.x = (v.x - 16.0f) * d;
+ v.y = (v.y - 16.0f) * d;
+ #endif // GGML_CUDA_F16
+ }
+
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const block_q5_1 * x = (const block_q5_1 *) vx;
+
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
+
+ uint32_t qh;
+ memcpy(&qh, x[ib].qh, sizeof(qh));
+
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+ #ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+ #else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+ #endif // GGML_CUDA_F16
+ }
+
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const block_q8_0 * x = (const block_q8_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ v.x = x[ib].qs[iqs + 0];
+ v.y = x[ib].qs[iqs + 1];
+
+ #ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+ #else
+ v.x *= d;
+ v.y *= d;
+ #endif // GGML_CUDA_F16
+ }
ggml-cuda/diagmask.cu ADDED
@@ -0,0 +1,40 @@
+ #include "diagmask.cuh"
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int i = row*ncols + col;
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+ }
+
+ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+ const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+ const dim3 block_nums(nrows_x, block_num_x, 1);
+ diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+ }
+
+ void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int nrows0 = ggml_nrows(src0);
+
+ const int n_past = ((int32_t *) dst->op_params)[0];
+
+ diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
+ }
ggml-cuda/diagmask.cuh ADDED
@@ -0,0 +1,5 @@
+ #include "common.cuh"
+
+ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+
+ void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/dmmv.cu ADDED
@@ -0,0 +1,817 @@
1
+ #include "dmmv.cuh"
2
+ #include "dequantize.cuh"
3
+ #include "convert.cuh"
4
+
5
+ #ifndef GGML_CUDA_MMV_Y
6
+ #define GGML_CUDA_MMV_Y 1
7
+ #endif
8
+
9
+ #ifndef K_QUANTS_PER_ITERATION
10
+ #define K_QUANTS_PER_ITERATION 2
11
+ #else
12
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
13
+ #endif
14
+
15
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
16
+
17
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
18
+
19
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
20
+ if (row > nrows) return;
21
+
22
+ const int num_blocks_per_row = ncols / QK_K;
23
+ const int ib0 = row*num_blocks_per_row;
24
+
25
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
26
+
27
+ float tmp = 0; // partial sum for thread in warp
28
+
29
+ #if QK_K == 256
30
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
31
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
32
+
33
+ const int step = 16/K_QUANTS_PER_ITERATION;
34
+
35
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
36
+ const int in = tid - step*im; // 0...15 or 0...7
37
+
38
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
39
+ const int q_offset = 32*im + l0;
40
+ const int s_offset = 8*im;
41
+ const int y_offset = 128*im + l0;
42
+
43
+ uint32_t aux[4];
44
+ const uint8_t * d = (const uint8_t *)aux;
45
+ const uint8_t * m = (const uint8_t *)(aux + 2);
46
+
47
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
48
+
49
+ const float * y = yy + i * QK_K + y_offset;
50
+ const uint8_t * q = x[i].qs + q_offset;
51
+
52
+ const float dall = __low2half(x[i].dm);
53
+ const float dmin = __high2half(x[i].dm);
54
+
55
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
56
+ aux[0] = a[0] & 0x0f0f0f0f;
57
+ aux[1] = a[1] & 0x0f0f0f0f;
58
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
59
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
60
+
61
+ float sum1 = 0, sum2 = 0;
62
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
63
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
64
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
65
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
66
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
67
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
68
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
69
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
70
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
71
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
72
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
73
+
74
+ }
75
+ tmp += dall * sum1 - dmin * sum2;
76
+
77
+ }
78
+ #else
79
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
80
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
81
+ const int offset = tid * K_QUANTS_PER_ITERATION;
82
+
83
+ uint32_t uaux[2];
84
+ const uint8_t * d = (const uint8_t *)uaux;
85
+
86
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
87
+
88
+ const float * y = yy + i * QK_K + offset;
89
+ const uint8_t * q = x[i].qs + offset;
90
+ const uint32_t * s = (const uint32_t *)x[i].scales;
91
+
92
+ uaux[0] = s[0] & 0x0f0f0f0f;
93
+ uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
94
+
95
+ const float2 dall = __half22float2(x[i].dm);
96
+
97
+ float sum1 = 0, sum2 = 0;
98
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
99
+ const uint8_t ql = q[l];
100
+ sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
101
+ + y[l+16] * d[1] * ((ql >> 2) & 3)
102
+ + y[l+32] * d[2] * ((ql >> 4) & 3)
103
+ + y[l+48] * d[3] * ((ql >> 6) & 3);
104
+ sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
105
+ }
106
+ tmp += dall.x * sum1 - dall.y * sum2;
107
+ }
108
+ #endif
109
+
110
+ // sum up partial sums and write back result
111
+ tmp = warp_reduce_sum(tmp);
112
+
113
+ if (threadIdx.x == 0) {
114
+ dst[row] = tmp;
115
+ }
116
+ }
117
+
118
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
119
+
120
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
121
+ if (row > nrows) return;
122
+
123
+ const int num_blocks_per_row = ncols / QK_K;
124
+ const int ib0 = row*num_blocks_per_row;
125
+
126
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
127
+
128
+ float tmp = 0; // partial sum for thread in warp
129
+
130
+ #if QK_K == 256
131
+
132
+ const uint16_t kmask1 = 0x0303;
133
+ const uint16_t kmask2 = 0x0f0f;
134
+
135
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
136
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
137
+
138
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
139
+ const int step = 16/K_QUANTS_PER_ITERATION;
140
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
141
+ const int in = tid - step*im; // 0....15 or 0...7
142
+
143
+ const uint8_t m = 1 << (4*im);
144
+
145
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
146
+ const int q_offset = 32*im + l0;
147
+ const int y_offset = 128*im + l0;
148
+
149
+ uint16_t utmp[4];
150
+ const int8_t * s = (const int8_t *)utmp;
151
+
152
+ const uint16_t s_shift = 4*im;
153
+
154
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
155
+
156
+ const float * y = yy + i * QK_K + y_offset;
157
+ const uint8_t * q = x[i].qs + q_offset;
158
+ const uint8_t * h = x[i].hmask + l0;
159
+
160
+ const uint16_t * a = (const uint16_t *)x[i].scales;
161
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
162
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
163
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
164
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
165
+
166
+ const float d = x[i].d;
167
+
168
+ float sum = 0;
169
+ for (int l = 0; l < n; ++l) {
170
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
171
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
172
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
173
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
174
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
175
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
176
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
177
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
178
+ }
179
+ tmp += d * sum;
180
+
181
+ }
182
+ #else
183
+
184
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
185
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
186
+ const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
187
+ const int in = offset/8; // 0 or 1
188
+ const int im = offset%8; // 0...7
189
+
190
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
191
+
192
+ const float * y = yy + i * QK_K + offset;
193
+ const uint8_t * q = x[i].qs + offset;
194
+ const uint8_t * s = x[i].scales;
195
+
196
+ const float dall = (float)x[i].d;
197
+
198
+ float sum = 0;
199
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
200
+ const uint8_t hl = x[i].hmask[im+l] >> in;
201
+ const uint8_t ql = q[l];
202
+ sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
203
+ + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
204
+ + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
205
+ + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
206
+ }
207
+ tmp += sum;
208
+ }
209
+ #endif
210
+
211
+ // sum up partial sums and write back result
212
+ tmp = warp_reduce_sum(tmp);
213
+
214
+ if (threadIdx.x == 0) {
215
+ dst[row] = tmp;
216
+ }
217
+ }
218
+
219
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
220
+
221
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
222
+ if (row > nrows) return;
223
+ const int num_blocks_per_row = ncols / QK_K;
224
+ const int ib0 = row*num_blocks_per_row;
225
+
226
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
227
+
228
+ #if QK_K == 256
229
+ const uint16_t kmask1 = 0x3f3f;
230
+ const uint16_t kmask2 = 0x0f0f;
231
+ const uint16_t kmask3 = 0xc0c0;
232
+
233
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
234
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
235
+
236
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
237
+
238
+ const int il = tid/step; // 0...3
239
+ const int ir = tid - step*il; // 0...7 or 0...3
240
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
241
+
242
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
243
+ const int in = il%2;
244
+
245
+ const int l0 = n*(2*ir + in);
246
+ const int q_offset = 32*im + l0;
247
+ const int y_offset = 64*im + l0;
248
+
249
+ uint16_t aux[4];
250
+ const uint8_t * sc = (const uint8_t *)aux;
251
+
252
+ #if K_QUANTS_PER_ITERATION == 2
253
+ uint32_t q32[4];
254
+ const uint8_t * q4 = (const uint8_t *)q32;
255
+ #else
256
+ uint16_t q16[4];
257
+ const uint8_t * q4 = (const uint8_t *)q16;
258
+ #endif
259
+
260
+ float tmp = 0; // partial sum for thread in warp
261
+
262
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
263
+
264
+ const float * y1 = yy + i*QK_K + y_offset;
265
+ const float * y2 = y1 + 128;
266
+
267
+ const float dall = __low2half(x[i].dm);
268
+ const float dmin = __high2half(x[i].dm);
269
+
270
+ const uint16_t * a = (const uint16_t *)x[i].scales;
271
+ aux[0] = a[im+0] & kmask1;
272
+ aux[1] = a[im+2] & kmask1;
273
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
274
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
275
+
276
+ #if K_QUANTS_PER_ITERATION == 2
277
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
278
+ const uint32_t * q2 = q1 + 16;
279
+
280
+ q32[0] = q1[0] & 0x0f0f0f0f;
281
+ q32[1] = q1[0] & 0xf0f0f0f0;
282
+ q32[2] = q2[0] & 0x0f0f0f0f;
283
+ q32[3] = q2[0] & 0xf0f0f0f0;
284
+
285
+ float4 s = {0.f, 0.f, 0.f, 0.f};
286
+ float smin = 0;
287
+ for (int l = 0; l < 4; ++l) {
288
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
289
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
290
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
291
+ }
292
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
293
+ #else
294
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
295
+ const uint16_t * q2 = q1 + 32;
296
+
297
+ q16[0] = q1[0] & 0x0f0f;
298
+ q16[1] = q1[0] & 0xf0f0;
299
+ q16[2] = q2[0] & 0x0f0f;
300
+ q16[3] = q2[0] & 0xf0f0;
301
+
302
+ float4 s = {0.f, 0.f, 0.f, 0.f};
303
+ float smin = 0;
304
+ for (int l = 0; l < 2; ++l) {
305
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
306
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
307
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
308
+ }
309
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
310
+ #endif
311
+
312
+ }
313
+ #else
314
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
315
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
316
+
317
+ const int step = tid * K_QUANTS_PER_ITERATION;
318
+
319
+ uint16_t aux16[2];
320
+ const uint8_t * s = (const uint8_t *)aux16;
321
+
322
+ float tmp = 0;
323
+
324
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
325
+ const uint8_t * q = x[i].qs + step;
326
+ const float * y = yy + i*QK_K + step;
327
+ const uint16_t * a = (const uint16_t *)x[i].scales;
328
+ aux16[0] = a[0] & 0x0f0f;
329
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
330
+ const float d = (float)x[i].dm[0];
331
+ const float m = (float)x[i].dm[1];
332
+ float sum = 0.f;
333
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
334
+ sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
335
+ + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
336
+ + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
337
+ + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
338
+ }
339
+ tmp += sum;
340
+ }
341
+
342
+ #endif
343
+
344
+ // sum up partial sums and write back result
345
+ tmp = warp_reduce_sum(tmp);
346
+
347
+ if (tid == 0) {
348
+ dst[row] = tmp;
349
+ }
350
+ }
351
+
352
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
353
+
354
+ const int row = blockIdx.x;
355
+ const int num_blocks_per_row = ncols / QK_K;
356
+ const int ib0 = row*num_blocks_per_row;
357
+
358
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
359
+
360
+ float tmp = 0; // partial sum for thread in warp
361
+
362
+ #if QK_K == 256
363
+ const uint16_t kmask1 = 0x3f3f;
364
+ const uint16_t kmask2 = 0x0f0f;
365
+ const uint16_t kmask3 = 0xc0c0;
366
+
367
+ const int tid = threadIdx.x/2; // 0...15
368
+ const int ix = threadIdx.x%2;
369
+
370
+ const int il = tid/4; // 0...3
371
+ const int ir = tid - 4*il;// 0...3
372
+ const int n = 2;
373
+
374
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
375
+ const int in = il%2;
376
+
377
+ const int l0 = n*(2*ir + in);
378
+ const int q_offset = 32*im + l0;
379
+ const int y_offset = 64*im + l0;
380
+
381
+ const uint8_t hm1 = 1 << (2*im);
382
+ const uint8_t hm2 = hm1 << 4;
383
+
384
+ uint16_t aux[4];
385
+ const uint8_t * sc = (const uint8_t *)aux;
386
+
387
+ uint16_t q16[8];
388
+ const uint8_t * q4 = (const uint8_t *)q16;
389
+
390
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
391
+
392
+ const uint8_t * ql1 = x[i].qs + q_offset;
393
+ const uint8_t * qh = x[i].qh + l0;
394
+ const float * y1 = yy + i*QK_K + y_offset;
395
+ const float * y2 = y1 + 128;
396
+
397
+ const float dall = __low2half(x[i].dm);
398
+ const float dmin = __high2half(x[i].dm);
399
+
400
+ const uint16_t * a = (const uint16_t *)x[i].scales;
401
+ aux[0] = a[im+0] & kmask1;
402
+ aux[1] = a[im+2] & kmask1;
403
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
404
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
405
+
406
+ float4 sum = {0.f, 0.f, 0.f, 0.f};
407
+ float smin = 0;
408
+ const uint16_t * q1 = (const uint16_t *)ql1;
409
+ const uint16_t * q2 = q1 + 32;
410
+ q16[0] = q1[0] & 0x0f0f;
411
+ q16[1] = q1[8] & 0x0f0f;
412
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
413
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
414
+ q16[4] = q2[0] & 0x0f0f;
415
+ q16[5] = q2[8] & 0x0f0f;
416
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
417
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
418
+ for (int l = 0; l < n; ++l) {
419
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
420
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
421
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
422
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
423
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
424
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
425
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
426
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
427
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
428
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
429
+ }
430
+ tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
431
+ }
432
+
433
+ #else
434
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
435
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
436
+ const int step = tid * K_QUANTS_PER_ITERATION;
437
+ const int im = step/8;
438
+ const int in = step%8;
439
+
440
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
441
+ const uint8_t * q = x[i].qs + step;
442
+ const int8_t * s = x[i].scales;
443
+ const float * y = yy + i*QK_K + step;
444
+ const float d = x[i].d;
445
+ float sum = 0.f;
446
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
447
+ const uint8_t h = x[i].qh[in+j] >> im;
448
+ sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
449
+ + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
450
+ + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
451
+ + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
452
+ }
453
+ tmp += sum;
454
+ }
455
+ #endif
456
+
457
+ // sum up partial sums and write back result
458
+ tmp = warp_reduce_sum(tmp);
459
+
460
+ if (threadIdx.x == 0) {
461
+ dst[row] = tmp;
462
+ }
463
+ }
464
+
465
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
466
+
467
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
468
+
469
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
470
+ if (row > nrows) return;
471
+
472
+ const int num_blocks_per_row = ncols / QK_K;
473
+ const int ib0 = row*num_blocks_per_row;
474
+
475
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
476
+
477
+ #if QK_K == 256
478
+
479
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
480
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
481
+
482
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
483
+
484
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
485
+ const int in = tid - step*im; // 0...15 or 0...7
486
+
487
+ #if K_QUANTS_PER_ITERATION == 1
488
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
489
+ const int is = 0;
490
+ #else
491
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
492
+ const int is = in / 4;
493
+ #endif
494
+ const int ql_offset = 64*im + l0;
495
+ const int qh_offset = 32*im + l0;
496
+ const int s_offset = 8*im + is;
497
+ const int y_offset = 128*im + l0;
498
+
499
+ float tmp = 0; // partial sum for thread in warp
500
+
501
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
502
+
503
+ const float * y = yy + i * QK_K + y_offset;
504
+ const uint8_t * ql = x[i].ql + ql_offset;
505
+ const uint8_t * qh = x[i].qh + qh_offset;
506
+ const int8_t * s = x[i].scales + s_offset;
507
+
508
+ const float d = x[i].d;
509
+
510
+ #if K_QUANTS_PER_ITERATION == 1
511
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
512
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
513
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
514
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
515
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
516
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
517
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
518
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
519
+ tmp += sum;
520
+ #else
521
+ float sum = 0;
522
+ for (int l = 0; l < 4; ++l) {
523
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
524
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
525
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
526
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
527
+ }
528
+ tmp += sum;
529
+ #endif
530
+
531
+ }
532
+
533
+ #else
534
+
535
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
536
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
537
+
538
+ const int step = tid * K_QUANTS_PER_ITERATION;
539
+
540
+ float tmp = 0; // partial sum for thread in warp
541
+
542
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
543
+
544
+ const float * y = yy + i * QK_K + step;
545
+ const uint8_t * ql = x[i].ql + step;
546
+ const uint8_t * qh = x[i].qh + step;
547
+ const int8_t * s = x[i].scales;
548
+
549
+ const float d = x[i+0].d;
550
+
551
+ float sum = 0;
552
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
553
+ sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
554
+ + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
555
+ + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
556
+ + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
557
+ }
558
+ tmp += sum;
559
+
560
+ }
561
+
562
+ #endif
563
+
564
+ // sum up partial sums and write back result
565
+ tmp = warp_reduce_sum(tmp);
566
+
567
+ if (tid == 0) {
568
+ dst[row] = tmp;
569
+ }
570
+ }
571
+
572
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
573
+ const half * x = (const half *) vx;
574
+
575
+ // automatic half -> float type cast if dfloat == float
576
+ v.x = x[ib + iqs + 0];
577
+ v.y = x[ib + iqs + 1];
578
+ }
579
+
580
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
581
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
582
+ // qk = quantized weights per x block
583
+ // qr = number of quantized weights per data value in x block
584
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
585
+
586
+ if (row >= nrows) {
587
+ return;
588
+ }
589
+
590
+ const int tid = threadIdx.x;
591
+
592
+ const int iter_stride = 2*GGML_CUDA_DMMV_X;
593
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
594
+ const int y_offset = qr == 1 ? 1 : qk/2;
595
+
596
+ // partial sum for each thread
597
+ #ifdef GGML_CUDA_F16
598
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
599
+ #else
600
+ float tmp = 0.0f;
601
+ #endif // GGML_CUDA_F16
602
+
603
+ for (int i = 0; i < ncols; i += iter_stride) {
604
+ const int col = i + vals_per_iter*tid;
605
+ const int ib = (row*ncols + col)/qk; // x block index
606
+ const int iqs = (col%qk)/qr; // x quant index
607
+ const int iybs = col - col%qk; // y block start index
608
+
609
+ // processing >2 values per i iter is faster for fast GPUs
610
+ #pragma unroll
611
+ for (int j = 0; j < vals_per_iter; j += 2) {
612
+ // process 2 vals per j iter
613
+
614
+ // dequantize
615
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
616
+ dfloat2 v;
617
+ dequantize_kernel(vx, ib, iqs + j/qr, v);
618
+
619
+ // matrix multiplication
620
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
621
+ #ifdef GGML_CUDA_F16
622
+ tmp += __hmul2(v, {
623
+ y[iybs + iqs + j/qr + 0],
624
+ y[iybs + iqs + j/qr + y_offset]
625
+ });
626
+ #else
627
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
628
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
629
+ #endif // GGML_CUDA_F16
630
+ }
631
+ }
632
+
633
+ // sum up partial sums and write back result
634
+ tmp = warp_reduce_sum(tmp);
635
+
636
+ if (tid == 0) {
637
+ #ifdef GGML_CUDA_F16
638
+ dst[row] = tmp.x + tmp.y;
639
+ #else
640
+ dst[row] = tmp;
641
+ #endif // GGML_CUDA_F16
642
+ }
643
+ }
644
+
645
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
646
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
647
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
648
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
649
+ const dim3 block_nums(block_num_y, 1, 1);
650
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
651
+ dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
652
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
653
+ }
654
+
655
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
656
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
657
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
658
+ const dim3 block_nums(block_num_y, 1, 1);
659
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
660
+ dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
661
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
662
+ }
663
+
664
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
665
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
666
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
667
+ const dim3 block_nums(block_num_y, 1, 1);
668
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
669
+ dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
670
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
671
+ }
672
+
673
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
674
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
675
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
676
+ const dim3 block_nums(block_num_y, 1, 1);
677
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
678
+ dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
679
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
680
+ }
681
+
682
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
683
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
684
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
685
+ const dim3 block_nums(block_num_y, 1, 1);
686
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
687
+ dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
688
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
689
+ }
690
+
691
+ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
692
+ GGML_ASSERT(ncols % QK_K == 0);
693
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
694
+ const int block_num_y = (nrows + ny - 1) / ny;
695
+ const dim3 block_nums(block_num_y, 1, 1);
696
+ const dim3 block_dims(32, ny, 1);
697
+ dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
698
+ }
699
+
700
+ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
701
+ GGML_ASSERT(ncols % QK_K == 0);
702
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
703
+ const int block_num_y = (nrows + ny - 1) / ny;
704
+ const dim3 block_nums(block_num_y, 1, 1);
705
+ const dim3 block_dims(32, ny, 1);
706
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
707
+ }
708
+
709
+ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
710
+ GGML_ASSERT(ncols % QK_K == 0);
711
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
712
+ const int block_num_y = (nrows + ny - 1) / ny;
713
+ const dim3 block_nums(block_num_y, 1, 1);
714
+ const dim3 block_dims(32, ny, 1);
715
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
716
+ }
717
+
718
+ static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
719
+ GGML_ASSERT(ncols % QK_K == 0);
720
+ const dim3 block_dims(32, 1, 1);
721
+ dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
722
+ }
723
+
724
+ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
725
+ GGML_ASSERT(ncols % QK_K == 0);
726
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
727
+ const int block_num_y = (nrows + ny - 1) / ny;
728
+ const dim3 block_nums(block_num_y, 1, 1);
729
+ const dim3 block_dims(32, ny, 1);
730
+ dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
731
+ }
732
+
733
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
734
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
735
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
736
+ const dim3 block_nums(block_num_y, 1, 1);
737
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
738
+ dequantize_mul_mat_vec<1, 1, convert_f16>
739
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
740
+ }
741
+
742
+ void ggml_cuda_op_dequantize_mul_mat_vec(
743
+ ggml_backend_cuda_context & ctx,
744
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
745
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
746
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
747
+ GGML_UNUSED(ctx);
748
+ const int64_t ne00 = src0->ne[0];
749
+ const int64_t row_diff = row_high - row_low;
750
+
751
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
752
+
753
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
754
+ #ifdef GGML_CUDA_F16
755
+ ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
756
+ half * src1_dfloat = nullptr; // dfloat == half
757
+
758
+ bool src1_convert_f16 =
759
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
760
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
761
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
762
+
763
+ if (src1_convert_f16) {
764
+ src1_dfloat = src1_dfloat_a.alloc(ne00);
765
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
766
+ GGML_ASSERT(to_fp16_cuda != nullptr);
767
+ to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
768
+ }
769
+ #else
770
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
771
+ #endif // GGML_CUDA_F16
772
+
773
+ switch (src0->type) {
774
+ case GGML_TYPE_Q4_0:
775
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
776
+ break;
777
+ case GGML_TYPE_Q4_1:
778
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
779
+ break;
780
+ case GGML_TYPE_Q5_0:
781
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
782
+ break;
783
+ case GGML_TYPE_Q5_1:
784
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
785
+ break;
786
+ case GGML_TYPE_Q8_0:
787
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
788
+ break;
789
+ case GGML_TYPE_Q2_K:
790
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
791
+ break;
792
+ case GGML_TYPE_Q3_K:
793
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
794
+ break;
795
+ case GGML_TYPE_Q4_K:
796
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
797
+ break;
798
+ case GGML_TYPE_Q5_K:
799
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
800
+ break;
801
+ case GGML_TYPE_Q6_K:
802
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
803
+ break;
804
+ case GGML_TYPE_F16:
805
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
806
+ break;
807
+ default:
808
+ GGML_ASSERT(false);
809
+ break;
810
+ }
811
+
812
+ GGML_UNUSED(src1);
813
+ GGML_UNUSED(dst);
814
+ GGML_UNUSED(src1_ddq_i);
815
+ GGML_UNUSED(src1_ncols);
816
+ GGML_UNUSED(src1_padded_row_size);
817
+ }
ggml-cuda/dmmv.cuh ADDED
@@ -0,0 +1,7 @@
+ #include "common.cuh"
+
+ void ggml_cuda_op_dequantize_mul_mat_vec(
+ ggml_backend_cuda_context & ctx,
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+ const int64_t src1_padded_row_size, cudaStream_t stream);
ggml-cuda/getrows.cu ADDED
@@ -0,0 +1,178 @@
+ #include "getrows.cuh"
+ #include "dequantize.cuh"
+
+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static __global__ void k_get_rows(
+ const void * src0, const int32_t * src1, dst_t * dst,
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+ const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+ if (i00 >= ne00) {
+ return;
+ }
+
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+ const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+ const int ib = i00/qk; // block index
+ const int iqs = (i00%qk)/qr; // quant index
+ const int iybs = i00 - i00%qk; // dst block start index
+ const int y_offset = qr == 1 ? 1 : qk/2;
+
+ // dequantize
+ dfloat2 v;
+ dequantize_kernel(src0_row, ib, iqs, v);
+
+ dst_row[iybs + iqs + 0] = v.x;
+ dst_row[iybs + iqs + y_offset] = v.y;
+ }
+
+ template<typename src0_t, typename dst_t>
+ static __global__ void k_get_rows_float(
+ const src0_t * src0, const int32_t * src1, dst_t * dst,
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+ const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+ if (i00 >= ne00) {
+ return;
+ }
+
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+ const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+ dst_row[i00] = src0_row[i00];
+ }
+
+ template<int qk, int qr, dequantize_kernel_t dq>
+ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+ const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+ // strides in elements
+ //const size_t s0 = nb0 / ggml_element_size(dst);
+ const size_t s1 = nb1 / ggml_element_size(dst);
+ const size_t s2 = nb2 / ggml_element_size(dst);
+ const size_t s3 = nb3 / ggml_element_size(dst);
+
+ const size_t s10 = nb10 / ggml_element_size(src1);
+ const size_t s11 = nb11 / ggml_element_size(src1);
+ const size_t s12 = nb12 / ggml_element_size(src1);
+ //const size_t s13 = nb13 / ggml_element_size(src1);
+
+ GGML_ASSERT(ne00 % 2 == 0);
+
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne00, /*ne01, ne02, ne03,*/
+ /*ne10, ne11,*/ ne12, /*ne13,*/
+ /* s0,*/ s1, s2, s3,
+ /* nb00,*/ nb01, nb02, nb03,
+ s10, s11, s12/*, s13*/);
+
+ GGML_UNUSED(dst);
+ }
+
+ template<typename src0_t>
+ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+ const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+ // strides in elements
+ //const size_t s0 = nb0 / ggml_element_size(dst);
+ const size_t s1 = nb1 / ggml_element_size(dst);
+ const size_t s2 = nb2 / ggml_element_size(dst);
+ const size_t s3 = nb3 / ggml_element_size(dst);
+
+ const size_t s10 = nb10 / ggml_element_size(src1);
+ const size_t s11 = nb11 / ggml_element_size(src1);
+ const size_t s12 = nb12 / ggml_element_size(src1);
+ //const size_t s13 = nb13 / ggml_element_size(src1);
+
+ k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne00, /*ne01, ne02, ne03,*/
+ /*ne10, ne11,*/ ne12, /*ne13,*/
+ /* s0,*/ s1, s2, s3,
+ /* nb00,*/ nb01, nb02, nb03,
+ s10, s11, s12/*, s13*/);
+
+ GGML_UNUSED(dst);
+ }
+
+ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+ const float * src0_d = (const float *)src0->data;
+ const float * src1_d = (const float *)src1->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+ GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+ GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_F32:
+ get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_Q4_0:
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_Q4_1:
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_Q5_0:
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_Q5_1:
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ case GGML_TYPE_Q8_0:
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+ break;
+ default:
+ // TODO: k-quants
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+ GGML_ASSERT(false);
+ break;
+ }
+ }
ggml-cuda/getrows.cuh ADDED
@@ -0,0 +1,5 @@
+ #include "common.cuh"
+
+ #define CUDA_GET_ROWS_BLOCK_SIZE 256
+
+ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/im2col.cu ADDED
@@ -0,0 +1,104 @@
+ #include "im2col.cuh"
+
+ template <typename T>
+ static __global__ void im2col_kernel(
+ const float * x, T * dst, int64_t batch_offset,
+ int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+ int s0, int s1, int p0, int p1, int d0, int d1) {
+ const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
+ if (i >= pelements) {
+ return;
+ }
+
+ const int64_t ksize = OW * (KH > 1 ? KW : 1);
+ const int64_t kx = i / ksize;
+ const int64_t kd = kx * ksize;
+ const int64_t ky = (i - kd) / OW;
+ const int64_t ix = i % OW;
+
+ const int64_t oh = blockIdx.y;
+ const int64_t batch = blockIdx.z / IC;
+ const int64_t ic = blockIdx.z % IC;
+
+ const int64_t iiw = ix * s0 + kx * d0 - p0;
+ const int64_t iih = oh * s1 + ky * d1 - p1;
+
+ const int64_t offset_dst =
+ ((batch * OH + oh) * OW + ix) * CHW +
+ (ic * (KW * KH) + ky * KW + kx);
+
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+ dst[offset_dst] = 0.0f;
+ } else {
+ const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+ dst[offset_dst] = x[offset_src + iih * IW + iiw];
+ }
+ }
+
+ template <typename T>
+ static void im2col_cuda(const float * x, T* dst,
+ int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+ int64_t batch, int64_t batch_offset, int64_t offset_delta,
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+ const int parallel_elements = OW * KW * KH;
+ const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+ dim3 block_nums(num_blocks, OH, batch * IC);
+ im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ }
+
+ static void im2col_cuda_f16(const float * x, half * dst,
+ int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+ int64_t batch, int64_t batch_offset, int64_t offset_delta,
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+ im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+ }
+
+ static void im2col_cuda_f32(const float * x, float * dst,
+ int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+ int64_t batch, int64_t batch_offset, int64_t offset_delta,
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+ im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+ }
+
+ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+ const float * src1_d = (const float *)src1->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+ const int64_t IC = src1->ne[is_2D ? 2 : 1];
+ const int64_t IH = is_2D ? src1->ne[1] : 1;
+ const int64_t IW = src1->ne[0];
+
+ const int64_t KH = is_2D ? src0->ne[1] : 1;
+ const int64_t KW = src0->ne[0];
+
+ const int64_t OH = is_2D ? dst->ne[2] : 1;
+ const int64_t OW = dst->ne[1];
+
+ const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+ const int64_t batch = src1->ne[3];
+ const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+ if(dst->type == GGML_TYPE_F16) {
+ im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+ } else {
+ im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+ }
+ }