Spaces:
Running
Running
cuda : fix dequantize kernel names (llama/4938)
Browse files- ggml-cuda.cu +6 -6
ggml-cuda.cu
CHANGED
|
@@ -6309,14 +6309,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 6309 |
}
|
| 6310 |
|
| 6311 |
template<typename dst_t>
|
| 6312 |
-
static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6313 |
const int nb32 = k / 32;
|
| 6314 |
const int nb = (k + 255) / 256;
|
| 6315 |
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 6316 |
}
|
| 6317 |
|
| 6318 |
template<typename dst_t>
|
| 6319 |
-
static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6320 |
const int nb32 = k / 32;
|
| 6321 |
const int nb = (k + 255) / 256;
|
| 6322 |
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
|
@@ -6370,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
| 6370 |
int id;
|
| 6371 |
switch (type) {
|
| 6372 |
case GGML_TYPE_Q4_0:
|
| 6373 |
-
return dequantize_q4_0_cuda;
|
| 6374 |
case GGML_TYPE_Q4_1:
|
| 6375 |
-
return dequantize_q4_1_cuda;
|
| 6376 |
case GGML_TYPE_Q5_0:
|
| 6377 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6378 |
case GGML_TYPE_Q5_1:
|
|
@@ -6407,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
| 6407 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 6408 |
switch (type) {
|
| 6409 |
case GGML_TYPE_Q4_0:
|
| 6410 |
-
return dequantize_q4_0_cuda;
|
| 6411 |
case GGML_TYPE_Q4_1:
|
| 6412 |
-
return dequantize_q4_1_cuda;
|
| 6413 |
case GGML_TYPE_Q5_0:
|
| 6414 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6415 |
case GGML_TYPE_Q5_1:
|
|
|
|
| 6309 |
}
|
| 6310 |
|
| 6311 |
template<typename dst_t>
|
| 6312 |
+
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6313 |
const int nb32 = k / 32;
|
| 6314 |
const int nb = (k + 255) / 256;
|
| 6315 |
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 6316 |
}
|
| 6317 |
|
| 6318 |
template<typename dst_t>
|
| 6319 |
+
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6320 |
const int nb32 = k / 32;
|
| 6321 |
const int nb = (k + 255) / 256;
|
| 6322 |
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
|
|
|
| 6370 |
int id;
|
| 6371 |
switch (type) {
|
| 6372 |
case GGML_TYPE_Q4_0:
|
| 6373 |
+
return dequantize_row_q4_0_cuda;
|
| 6374 |
case GGML_TYPE_Q4_1:
|
| 6375 |
+
return dequantize_row_q4_1_cuda;
|
| 6376 |
case GGML_TYPE_Q5_0:
|
| 6377 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6378 |
case GGML_TYPE_Q5_1:
|
|
|
|
| 6407 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 6408 |
switch (type) {
|
| 6409 |
case GGML_TYPE_Q4_0:
|
| 6410 |
+
return dequantize_row_q4_0_cuda;
|
| 6411 |
case GGML_TYPE_Q4_1:
|
| 6412 |
+
return dequantize_row_q4_1_cuda;
|
| 6413 |
case GGML_TYPE_Q5_0:
|
| 6414 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6415 |
case GGML_TYPE_Q5_1:
|