Spaces:
Running
llama : add Command R Plus support (llama/6491)
Browse files* Add Command R Plus GGUF
* Add Command R Plus GGUF
* Loading works up to LayerNorm2D
* Export new tensors in 1D so they are not quantized.
* Fix embedding layer based on Noeda's example
* Whitespace
* Add line
* Fix unexpected tokens on MPS. Re-add F16 fix. ((Noeda)
* dranger003: Fix block index overflow in CUDA dequantizing.
* Reverted blocked multiplication code as it still has issues and could affect other Llama arches
* export norms as f32
* fix overflow issues during quant and other cleanup
* Type convention
Co-authored-by: Georgi Gerganov <[email protected]>
* dranger003: Fix more int overflow during quant.
---------
Co-authored-by: S <[email protected]>
Co-authored-by: S <[email protected]>
Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml-cuda.cu +3 -3
- ggml-cuda/common.cuh +1 -1
- ggml-cuda/convert.cu +37 -37
- ggml-cuda/convert.cuh +1 -1
- ggml-cuda/dequantize.cuh +5 -5
- ggml-cuda/dmmv.cu +3 -3
- ggml-cuda/quantize.cu +8 -8
- ggml-cuda/quantize.cuh +1 -1
- ggml-quants.c +155 -155
- ggml-quants.h +82 -82
- ggml.c +8 -8
- ggml.h +7 -7
|
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
| 1225 |
|
| 1226 |
// the main device has a larger memory buffer to hold the results from all GPUs
|
| 1227 |
// ldc == nrows of the matrix that cuBLAS writes into
|
| 1228 |
-
|
| 1229 |
|
| 1230 |
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
| 1231 |
|
|
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
|
|
| 1377 |
const int64_t ne0 = dst->ne[0];
|
| 1378 |
const int64_t ne1 = dst->ne[1];
|
| 1379 |
|
| 1380 |
-
const
|
| 1381 |
-
const
|
| 1382 |
|
| 1383 |
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
| 1384 |
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
|
|
|
| 1225 |
|
| 1226 |
// the main device has a larger memory buffer to hold the results from all GPUs
|
| 1227 |
// ldc == nrows of the matrix that cuBLAS writes into
|
| 1228 |
+
int64_t ldc = id == ctx.device ? ne0 : row_diff;
|
| 1229 |
|
| 1230 |
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
| 1231 |
|
|
|
|
| 1377 |
const int64_t ne0 = dst->ne[0];
|
| 1378 |
const int64_t ne1 = dst->ne[1];
|
| 1379 |
|
| 1380 |
+
const int64_t nb2 = dst->nb[2];
|
| 1381 |
+
const int64_t nb3 = dst->nb[3];
|
| 1382 |
|
| 1383 |
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
| 1384 |
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
|
@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|
| 394 |
// TODO: move to ggml-common.h
|
| 395 |
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
| 396 |
|
| 397 |
-
typedef void (*dequantize_kernel_t)(const void * vx, const
|
| 398 |
|
| 399 |
|
| 400 |
//////////////////////
|
|
|
|
| 394 |
// TODO: move to ggml-common.h
|
| 395 |
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
| 396 |
|
| 397 |
+
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
| 398 |
|
| 399 |
|
| 400 |
//////////////////////
|
|
@@ -4,14 +4,14 @@
|
|
| 4 |
#define CUDA_Q8_0_NE_ALIGN 2048
|
| 5 |
|
| 6 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 7 |
-
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const
|
| 8 |
-
const
|
| 9 |
|
| 10 |
if (i >= k) {
|
| 11 |
return;
|
| 12 |
}
|
| 13 |
|
| 14 |
-
const
|
| 15 |
const int iqs = (i%qk)/qr; // quant index
|
| 16 |
const int iybs = i - i%qk; // y block start index
|
| 17 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
|
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
|
| 25 |
}
|
| 26 |
|
| 27 |
template <bool need_check>
|
| 28 |
-
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const
|
| 29 |
#if __CUDA_ARCH__ >= CC_PASCAL
|
| 30 |
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
| 31 |
|
|
@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|
| 68 |
template<typename dst_t>
|
| 69 |
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 70 |
|
| 71 |
-
const
|
| 72 |
|
| 73 |
// assume 32 threads
|
| 74 |
const int tid = threadIdx.x;
|
| 75 |
const int il = tid/8;
|
| 76 |
const int ir = tid%8;
|
| 77 |
-
const
|
| 78 |
if (ib >= nb32) {
|
| 79 |
return;
|
| 80 |
}
|
|
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
|
|
| 96 |
template<typename dst_t>
|
| 97 |
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 98 |
|
| 99 |
-
const
|
| 100 |
|
| 101 |
// assume 32 threads
|
| 102 |
const int tid = threadIdx.x;
|
| 103 |
const int il = tid/8;
|
| 104 |
const int ir = tid%8;
|
| 105 |
-
const
|
| 106 |
if (ib >= nb32) {
|
| 107 |
return;
|
| 108 |
}
|
|
@@ -313,14 +313,14 @@ template<typename dst_t>
|
|
| 313 |
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 314 |
const block_q6_K * x = (const block_q6_K *) vx;
|
| 315 |
|
| 316 |
-
const
|
| 317 |
#if QK_K == 256
|
| 318 |
|
| 319 |
// assume 64 threads - this is very slightly better than the one below
|
| 320 |
-
const
|
| 321 |
-
const
|
| 322 |
-
const
|
| 323 |
-
const
|
| 324 |
|
| 325 |
dst_t * y = yy + i*QK_K + 128*ip + il;
|
| 326 |
|
|
@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
|
| 337 |
#else
|
| 338 |
|
| 339 |
// assume 32 threads
|
| 340 |
-
const
|
| 341 |
-
const
|
| 342 |
-
const
|
| 343 |
|
| 344 |
dst_t * y = yy + i*QK_K + 16*ip + il;
|
| 345 |
|
|
@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
|
| 571 |
#endif
|
| 572 |
|
| 573 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 574 |
-
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const
|
| 575 |
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
| 576 |
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
| 577 |
}
|
| 578 |
|
| 579 |
-
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const
|
| 580 |
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
| 581 |
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
| 582 |
const bool need_check = false;
|
|
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
|
| 588 |
}
|
| 589 |
|
| 590 |
template<typename dst_t>
|
| 591 |
-
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const
|
| 592 |
const int nb = k / QK_K;
|
| 593 |
#if QK_K == 256
|
| 594 |
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 598 |
}
|
| 599 |
|
| 600 |
template<typename dst_t>
|
| 601 |
-
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const
|
| 602 |
const int nb = k / QK_K;
|
| 603 |
#if QK_K == 256
|
| 604 |
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 608 |
}
|
| 609 |
|
| 610 |
template<typename dst_t>
|
| 611 |
-
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const
|
| 612 |
const int nb32 = k / 32;
|
| 613 |
const int nb = (k + 255) / 256;
|
| 614 |
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 615 |
}
|
| 616 |
|
| 617 |
template<typename dst_t>
|
| 618 |
-
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const
|
| 619 |
const int nb32 = k / 32;
|
| 620 |
const int nb = (k + 255) / 256;
|
| 621 |
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 622 |
}
|
| 623 |
|
| 624 |
template<typename dst_t>
|
| 625 |
-
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const
|
| 626 |
const int nb = k / QK_K;
|
| 627 |
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
| 628 |
}
|
| 629 |
|
| 630 |
template<typename dst_t>
|
| 631 |
-
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const
|
| 632 |
const int nb = k / QK_K;
|
| 633 |
#if QK_K == 256
|
| 634 |
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 638 |
}
|
| 639 |
|
| 640 |
template<typename dst_t>
|
| 641 |
-
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const
|
| 642 |
const int nb = k / QK_K;
|
| 643 |
#if QK_K == 256
|
| 644 |
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 648 |
}
|
| 649 |
|
| 650 |
template<typename dst_t>
|
| 651 |
-
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const
|
| 652 |
const int nb = k / QK_K;
|
| 653 |
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
| 654 |
}
|
| 655 |
|
| 656 |
template<typename dst_t>
|
| 657 |
-
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const
|
| 658 |
const int nb = k / QK_K;
|
| 659 |
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
| 660 |
}
|
| 661 |
|
| 662 |
template<typename dst_t>
|
| 663 |
-
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const
|
| 664 |
const int nb = k / QK_K;
|
| 665 |
dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 666 |
}
|
| 667 |
|
| 668 |
template<typename dst_t>
|
| 669 |
-
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const
|
| 670 |
const int nb = k / QK_K;
|
| 671 |
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
| 672 |
}
|
| 673 |
|
| 674 |
template<typename dst_t>
|
| 675 |
-
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const
|
| 676 |
const int nb = k / QK_K;
|
| 677 |
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 678 |
}
|
| 679 |
|
| 680 |
template<typename dst_t>
|
| 681 |
-
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const
|
| 682 |
const int nb = k / QK_K;
|
| 683 |
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 684 |
}
|
| 685 |
|
| 686 |
template<typename dst_t>
|
| 687 |
-
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const
|
| 688 |
const int nb = (k + QK_K - 1) / QK_K;
|
| 689 |
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
| 690 |
}
|
| 691 |
|
| 692 |
template<typename dst_t>
|
| 693 |
-
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const
|
| 694 |
const int nb = k / QK_K;
|
| 695 |
dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
|
| 696 |
}
|
| 697 |
|
| 698 |
template<typename dst_t>
|
| 699 |
-
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const
|
| 700 |
const int nb = (k + QK_K - 1) / QK_K;
|
| 701 |
#if QK_K == 64
|
| 702 |
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
|
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
|
|
| 706 |
}
|
| 707 |
|
| 708 |
template <typename src_t, typename dst_t>
|
| 709 |
-
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const
|
| 710 |
-
const
|
| 711 |
|
| 712 |
if (i >= k) {
|
| 713 |
return;
|
|
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
|
|
| 719 |
}
|
| 720 |
|
| 721 |
template <typename src_t, typename dst_t>
|
| 722 |
-
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const
|
| 723 |
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
| 724 |
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
| 725 |
}
|
|
|
|
| 4 |
#define CUDA_Q8_0_NE_ALIGN 2048
|
| 5 |
|
| 6 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 7 |
+
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
| 8 |
+
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
| 9 |
|
| 10 |
if (i >= k) {
|
| 11 |
return;
|
| 12 |
}
|
| 13 |
|
| 14 |
+
const int64_t ib = i/qk; // block index
|
| 15 |
const int iqs = (i%qk)/qr; // quant index
|
| 16 |
const int iybs = i - i%qk; // y block start index
|
| 17 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
template <bool need_check>
|
| 28 |
+
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
|
| 29 |
#if __CUDA_ARCH__ >= CC_PASCAL
|
| 30 |
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
| 31 |
|
|
|
|
| 68 |
template<typename dst_t>
|
| 69 |
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 70 |
|
| 71 |
+
const int64_t i = blockIdx.x;
|
| 72 |
|
| 73 |
// assume 32 threads
|
| 74 |
const int tid = threadIdx.x;
|
| 75 |
const int il = tid/8;
|
| 76 |
const int ir = tid%8;
|
| 77 |
+
const int64_t ib = 8*i + ir;
|
| 78 |
if (ib >= nb32) {
|
| 79 |
return;
|
| 80 |
}
|
|
|
|
| 96 |
template<typename dst_t>
|
| 97 |
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 98 |
|
| 99 |
+
const int64_t i = blockIdx.x;
|
| 100 |
|
| 101 |
// assume 32 threads
|
| 102 |
const int tid = threadIdx.x;
|
| 103 |
const int il = tid/8;
|
| 104 |
const int ir = tid%8;
|
| 105 |
+
const int64_t ib = 8*i + ir;
|
| 106 |
if (ib >= nb32) {
|
| 107 |
return;
|
| 108 |
}
|
|
|
|
| 313 |
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 314 |
const block_q6_K * x = (const block_q6_K *) vx;
|
| 315 |
|
| 316 |
+
const int64_t i = blockIdx.x;
|
| 317 |
#if QK_K == 256
|
| 318 |
|
| 319 |
// assume 64 threads - this is very slightly better than the one below
|
| 320 |
+
const int64_t tid = threadIdx.x;
|
| 321 |
+
const int64_t ip = tid/32; // ip is 0 or 1
|
| 322 |
+
const int64_t il = tid - 32*ip; // 0...32
|
| 323 |
+
const int64_t is = 8*ip + il/16;
|
| 324 |
|
| 325 |
dst_t * y = yy + i*QK_K + 128*ip + il;
|
| 326 |
|
|
|
|
| 337 |
#else
|
| 338 |
|
| 339 |
// assume 32 threads
|
| 340 |
+
const int64_t tid = threadIdx.x;
|
| 341 |
+
const int64_t ip = tid/16; // 0 or 1
|
| 342 |
+
const int64_t il = tid - 16*ip; // 0...15
|
| 343 |
|
| 344 |
dst_t * y = yy + i*QK_K + 16*ip + il;
|
| 345 |
|
|
|
|
| 571 |
#endif
|
| 572 |
|
| 573 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 574 |
+
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
| 575 |
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
| 576 |
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
| 577 |
}
|
| 578 |
|
| 579 |
+
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
| 580 |
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
| 581 |
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
| 582 |
const bool need_check = false;
|
|
|
|
| 588 |
}
|
| 589 |
|
| 590 |
template<typename dst_t>
|
| 591 |
+
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 592 |
const int nb = k / QK_K;
|
| 593 |
#if QK_K == 256
|
| 594 |
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
|
|
| 598 |
}
|
| 599 |
|
| 600 |
template<typename dst_t>
|
| 601 |
+
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 602 |
const int nb = k / QK_K;
|
| 603 |
#if QK_K == 256
|
| 604 |
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
|
|
| 608 |
}
|
| 609 |
|
| 610 |
template<typename dst_t>
|
| 611 |
+
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 612 |
const int nb32 = k / 32;
|
| 613 |
const int nb = (k + 255) / 256;
|
| 614 |
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 615 |
}
|
| 616 |
|
| 617 |
template<typename dst_t>
|
| 618 |
+
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 619 |
const int nb32 = k / 32;
|
| 620 |
const int nb = (k + 255) / 256;
|
| 621 |
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 622 |
}
|
| 623 |
|
| 624 |
template<typename dst_t>
|
| 625 |
+
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 626 |
const int nb = k / QK_K;
|
| 627 |
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
| 628 |
}
|
| 629 |
|
| 630 |
template<typename dst_t>
|
| 631 |
+
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 632 |
const int nb = k / QK_K;
|
| 633 |
#if QK_K == 256
|
| 634 |
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
|
|
| 638 |
}
|
| 639 |
|
| 640 |
template<typename dst_t>
|
| 641 |
+
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 642 |
const int nb = k / QK_K;
|
| 643 |
#if QK_K == 256
|
| 644 |
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
|
|
|
| 648 |
}
|
| 649 |
|
| 650 |
template<typename dst_t>
|
| 651 |
+
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 652 |
const int nb = k / QK_K;
|
| 653 |
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
| 654 |
}
|
| 655 |
|
| 656 |
template<typename dst_t>
|
| 657 |
+
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 658 |
const int nb = k / QK_K;
|
| 659 |
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
| 660 |
}
|
| 661 |
|
| 662 |
template<typename dst_t>
|
| 663 |
+
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 664 |
const int nb = k / QK_K;
|
| 665 |
dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 666 |
}
|
| 667 |
|
| 668 |
template<typename dst_t>
|
| 669 |
+
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 670 |
const int nb = k / QK_K;
|
| 671 |
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
| 672 |
}
|
| 673 |
|
| 674 |
template<typename dst_t>
|
| 675 |
+
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 676 |
const int nb = k / QK_K;
|
| 677 |
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 678 |
}
|
| 679 |
|
| 680 |
template<typename dst_t>
|
| 681 |
+
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 682 |
const int nb = k / QK_K;
|
| 683 |
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
| 684 |
}
|
| 685 |
|
| 686 |
template<typename dst_t>
|
| 687 |
+
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 688 |
const int nb = (k + QK_K - 1) / QK_K;
|
| 689 |
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
| 690 |
}
|
| 691 |
|
| 692 |
template<typename dst_t>
|
| 693 |
+
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 694 |
const int nb = k / QK_K;
|
| 695 |
dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
|
| 696 |
}
|
| 697 |
|
| 698 |
template<typename dst_t>
|
| 699 |
+
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
| 700 |
const int nb = (k + QK_K - 1) / QK_K;
|
| 701 |
#if QK_K == 64
|
| 702 |
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
|
|
|
| 706 |
}
|
| 707 |
|
| 708 |
template <typename src_t, typename dst_t>
|
| 709 |
+
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
| 710 |
+
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
| 711 |
|
| 712 |
if (i >= k) {
|
| 713 |
return;
|
|
|
|
| 719 |
}
|
| 720 |
|
| 721 |
template <typename src_t, typename dst_t>
|
| 722 |
+
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
| 723 |
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
| 724 |
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
| 725 |
}
|
|
@@ -3,7 +3,7 @@
|
|
| 3 |
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
| 4 |
|
| 5 |
template<typename T>
|
| 6 |
-
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y,
|
| 7 |
|
| 8 |
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
| 9 |
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
|
|
|
| 3 |
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
| 4 |
|
| 5 |
template<typename T>
|
| 6 |
+
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
|
| 7 |
|
| 8 |
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
| 9 |
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#include "common.cuh"
|
| 2 |
|
| 3 |
-
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const
|
| 4 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
| 5 |
|
| 6 |
const dfloat d = x[ib].d;
|
|
@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
|
| 19 |
#endif // GGML_CUDA_F16
|
| 20 |
}
|
| 21 |
|
| 22 |
-
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const
|
| 23 |
const block_q4_1 * x = (const block_q4_1 *) vx;
|
| 24 |
|
| 25 |
const dfloat d = __low2half(x[ib].dm);
|
|
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
|
|
| 39 |
#endif // GGML_CUDA_F16
|
| 40 |
}
|
| 41 |
|
| 42 |
-
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const
|
| 43 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
| 44 |
|
| 45 |
const dfloat d = x[ib].d;
|
|
@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
|
| 62 |
#endif // GGML_CUDA_F16
|
| 63 |
}
|
| 64 |
|
| 65 |
-
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const
|
| 66 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
| 67 |
|
| 68 |
const dfloat d = __low2half(x[ib].dm);
|
|
@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
|
|
| 86 |
#endif // GGML_CUDA_F16
|
| 87 |
}
|
| 88 |
|
| 89 |
-
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const
|
| 90 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
| 91 |
|
| 92 |
const dfloat d = x[ib].d;
|
|
|
|
| 1 |
#include "common.cuh"
|
| 2 |
|
| 3 |
+
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 4 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
| 5 |
|
| 6 |
const dfloat d = x[ib].d;
|
|
|
|
| 19 |
#endif // GGML_CUDA_F16
|
| 20 |
}
|
| 21 |
|
| 22 |
+
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 23 |
const block_q4_1 * x = (const block_q4_1 *) vx;
|
| 24 |
|
| 25 |
const dfloat d = __low2half(x[ib].dm);
|
|
|
|
| 39 |
#endif // GGML_CUDA_F16
|
| 40 |
}
|
| 41 |
|
| 42 |
+
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 43 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
| 44 |
|
| 45 |
const dfloat d = x[ib].d;
|
|
|
|
| 62 |
#endif // GGML_CUDA_F16
|
| 63 |
}
|
| 64 |
|
| 65 |
+
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 66 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
| 67 |
|
| 68 |
const dfloat d = __low2half(x[ib].dm);
|
|
|
|
| 86 |
#endif // GGML_CUDA_F16
|
| 87 |
}
|
| 88 |
|
| 89 |
+
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 90 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
| 91 |
|
| 92 |
const dfloat d = x[ib].d;
|
|
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
|
| 565 |
}
|
| 566 |
}
|
| 567 |
|
| 568 |
-
static __device__ void convert_f16(const void * vx, const
|
| 569 |
const half * x = (const half *) vx;
|
| 570 |
|
| 571 |
// automatic half -> float type cast if dfloat == float
|
|
@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
|
| 577 |
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
| 578 |
// qk = quantized weights per x block
|
| 579 |
// qr = number of quantized weights per data value in x block
|
| 580 |
-
const
|
| 581 |
|
| 582 |
if (row >= nrows) {
|
| 583 |
return;
|
|
@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
| 598 |
|
| 599 |
for (int i = 0; i < ncols; i += iter_stride) {
|
| 600 |
const int col = i + vals_per_iter*tid;
|
| 601 |
-
const
|
| 602 |
const int iqs = (col%qk)/qr; // x quant index
|
| 603 |
const int iybs = col - col%qk; // y block start index
|
| 604 |
|
|
|
|
| 565 |
}
|
| 566 |
}
|
| 567 |
|
| 568 |
+
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
| 569 |
const half * x = (const half *) vx;
|
| 570 |
|
| 571 |
// automatic half -> float type cast if dfloat == float
|
|
|
|
| 577 |
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
| 578 |
// qk = quantized weights per x block
|
| 579 |
// qr = number of quantized weights per data value in x block
|
| 580 |
+
const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
|
| 581 |
|
| 582 |
if (row >= nrows) {
|
| 583 |
return;
|
|
|
|
| 598 |
|
| 599 |
for (int i = 0; i < ncols; i += iter_stride) {
|
| 600 |
const int col = i + vals_per_iter*tid;
|
| 601 |
+
const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
|
| 602 |
const int iqs = (col%qk)/qr; // x quant index
|
| 603 |
const int iybs = col - col%qk; // y block start index
|
| 604 |
|
|
@@ -1,20 +1,20 @@
|
|
| 1 |
#include "quantize.cuh"
|
| 2 |
|
| 3 |
-
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const
|
| 4 |
-
const
|
| 5 |
|
| 6 |
if (ix >= kx_padded) {
|
| 7 |
return;
|
| 8 |
}
|
| 9 |
|
| 10 |
-
const
|
| 11 |
|
| 12 |
-
const
|
| 13 |
|
| 14 |
block_q8_1 * y = (block_q8_1 *) vy;
|
| 15 |
|
| 16 |
-
const
|
| 17 |
-
const
|
| 18 |
|
| 19 |
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
| 20 |
float amax = fabsf(xi);
|
|
@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|
| 36 |
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
| 37 |
}
|
| 38 |
|
| 39 |
-
void quantize_row_q8_1_cuda(const float * x, void * vy, const
|
| 40 |
-
const
|
| 41 |
const dim3 num_blocks(block_num_x, ky, 1);
|
| 42 |
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
| 43 |
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
|
|
|
| 1 |
#include "quantize.cuh"
|
| 2 |
|
| 3 |
+
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
|
| 4 |
+
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
| 5 |
|
| 6 |
if (ix >= kx_padded) {
|
| 7 |
return;
|
| 8 |
}
|
| 9 |
|
| 10 |
+
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
|
| 11 |
|
| 12 |
+
const int64_t i_padded = (int64_t)iy*kx_padded + ix;
|
| 13 |
|
| 14 |
block_q8_1 * y = (block_q8_1 *) vy;
|
| 15 |
|
| 16 |
+
const int64_t ib = i_padded / QK8_1; // block index
|
| 17 |
+
const int64_t iqs = i_padded % QK8_1; // quant index
|
| 18 |
|
| 19 |
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
| 20 |
float amax = fabsf(xi);
|
|
|
|
| 36 |
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
| 37 |
}
|
| 38 |
|
| 39 |
+
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
|
| 40 |
+
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
| 41 |
const dim3 num_blocks(block_num_x, ky, 1);
|
| 42 |
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
| 43 |
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
|
@@ -2,4 +2,4 @@
|
|
| 2 |
|
| 3 |
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
| 4 |
|
| 5 |
-
void quantize_row_q8_1_cuda(const float * x, void * vy, const
|
|
|
|
| 2 |
|
| 3 |
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
| 4 |
|
| 5 |
+
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
|
|
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
|
|
| 544 |
#endif
|
| 545 |
|
| 546 |
// reference implementation for deterministic creation of model files
|
| 547 |
-
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y,
|
| 548 |
static const int qk = QK4_0;
|
| 549 |
|
| 550 |
assert(k % qk == 0);
|
|
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
|
| 581 |
}
|
| 582 |
}
|
| 583 |
|
| 584 |
-
void quantize_row_q4_0(const float * restrict x, void * restrict y,
|
| 585 |
quantize_row_q4_0_reference(x, y, k);
|
| 586 |
}
|
| 587 |
|
| 588 |
|
| 589 |
-
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y,
|
| 590 |
const int qk = QK4_1;
|
| 591 |
|
| 592 |
assert(k % qk == 0);
|
|
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
|
|
| 623 |
}
|
| 624 |
}
|
| 625 |
|
| 626 |
-
void quantize_row_q4_1(const float * restrict x, void * restrict y,
|
| 627 |
quantize_row_q4_1_reference(x, y, k);
|
| 628 |
}
|
| 629 |
|
| 630 |
-
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y,
|
| 631 |
static const int qk = QK5_0;
|
| 632 |
|
| 633 |
assert(k % qk == 0);
|
|
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
|
|
| 671 |
}
|
| 672 |
}
|
| 673 |
|
| 674 |
-
void quantize_row_q5_0(const float * restrict x, void * restrict y,
|
| 675 |
quantize_row_q5_0_reference(x, y, k);
|
| 676 |
}
|
| 677 |
|
| 678 |
-
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y,
|
| 679 |
const int qk = QK5_1;
|
| 680 |
|
| 681 |
assert(k % qk == 0);
|
|
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
|
|
| 719 |
}
|
| 720 |
}
|
| 721 |
|
| 722 |
-
void quantize_row_q5_1(const float * restrict x, void * restrict y,
|
| 723 |
quantize_row_q5_1_reference(x, y, k);
|
| 724 |
}
|
| 725 |
|
| 726 |
// reference implementation for deterministic creation of model files
|
| 727 |
-
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y,
|
| 728 |
assert(k % QK8_0 == 0);
|
| 729 |
const int nb = k / QK8_0;
|
| 730 |
|
|
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
|
|
| 749 |
}
|
| 750 |
}
|
| 751 |
|
| 752 |
-
void quantize_row_q8_0(const float * restrict x, void * restrict vy,
|
| 753 |
assert(QK8_0 == 32);
|
| 754 |
assert(k % QK8_0 == 0);
|
| 755 |
const int nb = k / QK8_0;
|
|
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
|
|
| 938 |
}
|
| 939 |
|
| 940 |
// reference implementation for deterministic creation of model files
|
| 941 |
-
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y,
|
| 942 |
assert(QK8_1 == 32);
|
| 943 |
assert(k % QK8_1 == 0);
|
| 944 |
const int nb = k / QK8_1;
|
|
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
| 973 |
}
|
| 974 |
}
|
| 975 |
|
| 976 |
-
void quantize_row_q8_1(const float * restrict x, void * restrict vy,
|
| 977 |
assert(k % QK8_1 == 0);
|
| 978 |
const int nb = k / QK8_1;
|
| 979 |
|
|
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1192 |
#endif
|
| 1193 |
}
|
| 1194 |
|
| 1195 |
-
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y,
|
| 1196 |
static const int qk = QK4_0;
|
| 1197 |
|
| 1198 |
assert(k % qk == 0);
|
|
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
|
|
| 1212 |
}
|
| 1213 |
}
|
| 1214 |
|
| 1215 |
-
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y,
|
| 1216 |
static const int qk = QK4_1;
|
| 1217 |
|
| 1218 |
assert(k % qk == 0);
|
|
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
|
|
| 1233 |
}
|
| 1234 |
}
|
| 1235 |
|
| 1236 |
-
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y,
|
| 1237 |
static const int qk = QK5_0;
|
| 1238 |
|
| 1239 |
assert(k % qk == 0);
|
|
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
|
|
| 1259 |
}
|
| 1260 |
}
|
| 1261 |
|
| 1262 |
-
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y,
|
| 1263 |
static const int qk = QK5_1;
|
| 1264 |
|
| 1265 |
assert(k % qk == 0);
|
|
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
|
|
| 1286 |
}
|
| 1287 |
}
|
| 1288 |
|
| 1289 |
-
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y,
|
| 1290 |
static const int qk = QK8_0;
|
| 1291 |
|
| 1292 |
assert(k % qk == 0);
|
|
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|
| 1581 |
|
| 1582 |
//========================- 2-bit (de)-quantization
|
| 1583 |
|
| 1584 |
-
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y,
|
| 1585 |
assert(k % QK_K == 0);
|
| 1586 |
const int nb = k / QK_K;
|
| 1587 |
|
|
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
|
| 1658 |
}
|
| 1659 |
}
|
| 1660 |
|
| 1661 |
-
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y,
|
| 1662 |
assert(k % QK_K == 0);
|
| 1663 |
const int nb = k / QK_K;
|
| 1664 |
|
|
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
|
|
| 1704 |
}
|
| 1705 |
}
|
| 1706 |
|
| 1707 |
-
void quantize_row_q2_K(const float * restrict x, void * restrict vy,
|
| 1708 |
quantize_row_q2_K_reference(x, vy, k);
|
| 1709 |
}
|
| 1710 |
|
|
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
| 1960 |
}
|
| 1961 |
}
|
| 1962 |
|
| 1963 |
-
size_t quantize_q2_K(const float * restrict src, void * restrict dst,
|
| 1964 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1965 |
if (!quant_weights) {
|
| 1966 |
-
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
| 1967 |
}
|
| 1968 |
else {
|
| 1969 |
char * qrow = (char *)dst;
|
| 1970 |
-
for (
|
| 1971 |
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
| 1972 |
src += n_per_row;
|
| 1973 |
qrow += row_size;
|
|
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
|
|
| 1978 |
|
| 1979 |
//========================= 3-bit (de)-quantization
|
| 1980 |
|
| 1981 |
-
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y,
|
| 1982 |
assert(k % QK_K == 0);
|
| 1983 |
const int nb = k / QK_K;
|
| 1984 |
|
|
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
|
|
| 2092 |
}
|
| 2093 |
|
| 2094 |
#if QK_K == 256
|
| 2095 |
-
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y,
|
| 2096 |
assert(k % QK_K == 0);
|
| 2097 |
const int nb = k / QK_K;
|
| 2098 |
|
|
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
|
|
| 2142 |
}
|
| 2143 |
}
|
| 2144 |
#else
|
| 2145 |
-
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y,
|
| 2146 |
assert(k % QK_K == 0);
|
| 2147 |
assert(QK_K == 64);
|
| 2148 |
const int nb = k / QK_K;
|
|
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
|
|
| 2175 |
}
|
| 2176 |
#endif
|
| 2177 |
|
| 2178 |
-
void quantize_row_q3_K(const float * restrict x, void * restrict vy,
|
| 2179 |
quantize_row_q3_K_reference(x, vy, k);
|
| 2180 |
}
|
| 2181 |
|
| 2182 |
-
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y,
|
| 2183 |
#if QK_K != 256
|
| 2184 |
(void)quant_weights;
|
| 2185 |
quantize_row_q3_K_reference(x, y, n_per_row);
|
|
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
| 2268 |
#endif
|
| 2269 |
}
|
| 2270 |
|
| 2271 |
-
size_t quantize_q3_K(const float * restrict src, void * restrict dst,
|
| 2272 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2273 |
if (!quant_weights) {
|
| 2274 |
-
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
| 2275 |
}
|
| 2276 |
else {
|
| 2277 |
char * qrow = (char *)dst;
|
| 2278 |
-
for (
|
| 2279 |
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
| 2280 |
src += n_per_row;
|
| 2281 |
qrow += row_size;
|
|
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
|
|
| 2286 |
|
| 2287 |
// ====================== 4-bit (de)-quantization
|
| 2288 |
|
| 2289 |
-
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y,
|
| 2290 |
assert(k % QK_K == 0);
|
| 2291 |
const int nb = k / QK_K;
|
| 2292 |
|
|
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|
| 2393 |
}
|
| 2394 |
}
|
| 2395 |
|
| 2396 |
-
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y,
|
| 2397 |
assert(k % QK_K == 0);
|
| 2398 |
const int nb = k / QK_K;
|
| 2399 |
|
|
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
|
|
| 2432 |
}
|
| 2433 |
}
|
| 2434 |
|
| 2435 |
-
void quantize_row_q4_K(const float * restrict x, void * restrict vy,
|
| 2436 |
assert(k % QK_K == 0);
|
| 2437 |
block_q4_K * restrict y = vy;
|
| 2438 |
quantize_row_q4_K_reference(x, y, k);
|
| 2439 |
}
|
| 2440 |
|
| 2441 |
-
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y,
|
| 2442 |
#if QK_K != 256
|
| 2443 |
(void)quant_weights;
|
| 2444 |
quantize_row_q4_K_reference(x, y, n_per_row);
|
| 2445 |
#else
|
| 2446 |
assert(n_per_row % QK_K == 0);
|
| 2447 |
-
const
|
| 2448 |
|
| 2449 |
uint8_t L[QK_K];
|
| 2450 |
uint8_t Laux[32];
|
|
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2516 |
#endif
|
| 2517 |
}
|
| 2518 |
|
| 2519 |
-
size_t quantize_q4_K(const float * restrict src, void * restrict dst,
|
| 2520 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2521 |
if (!quant_weights) {
|
| 2522 |
-
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
| 2523 |
}
|
| 2524 |
else {
|
| 2525 |
char * qrow = (char *)dst;
|
| 2526 |
-
for (
|
| 2527 |
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
| 2528 |
src += n_per_row;
|
| 2529 |
qrow += row_size;
|
|
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
|
|
| 2534 |
|
| 2535 |
// ====================== 5-bit (de)-quantization
|
| 2536 |
|
| 2537 |
-
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y,
|
| 2538 |
assert(k % QK_K == 0);
|
| 2539 |
-
const
|
| 2540 |
|
| 2541 |
#if QK_K == 256
|
| 2542 |
uint8_t L[QK_K];
|
|
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
| 2676 |
}
|
| 2677 |
}
|
| 2678 |
|
| 2679 |
-
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y,
|
| 2680 |
assert(k % QK_K == 0);
|
| 2681 |
-
const
|
| 2682 |
|
| 2683 |
for (int i = 0; i < nb; i++) {
|
| 2684 |
|
|
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
|
|
| 2721 |
}
|
| 2722 |
}
|
| 2723 |
|
| 2724 |
-
void quantize_row_q5_K(const float * restrict x, void * restrict vy,
|
| 2725 |
assert(k % QK_K == 0);
|
| 2726 |
block_q5_K * restrict y = vy;
|
| 2727 |
quantize_row_q5_K_reference(x, y, k);
|
| 2728 |
}
|
| 2729 |
|
| 2730 |
-
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y,
|
| 2731 |
#if QK_K != 256
|
| 2732 |
(void)quant_weights;
|
| 2733 |
quantize_row_q5_K_reference(x, y, n_per_row);
|
| 2734 |
#else
|
| 2735 |
assert(n_per_row % QK_K == 0);
|
| 2736 |
-
const
|
| 2737 |
|
| 2738 |
uint8_t L[QK_K];
|
| 2739 |
uint8_t Laux[32];
|
|
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2825 |
#endif
|
| 2826 |
}
|
| 2827 |
|
| 2828 |
-
size_t quantize_q5_K(const float * restrict src, void * restrict dst,
|
| 2829 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2830 |
if (!quant_weights) {
|
| 2831 |
-
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
| 2832 |
}
|
| 2833 |
else {
|
| 2834 |
char * qrow = (char *)dst;
|
| 2835 |
-
for (
|
| 2836 |
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
| 2837 |
src += n_per_row;
|
| 2838 |
qrow += row_size;
|
|
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
|
|
| 2843 |
|
| 2844 |
// ====================== 6-bit (de)-quantization
|
| 2845 |
|
| 2846 |
-
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y,
|
| 2847 |
assert(k % QK_K == 0);
|
| 2848 |
-
const
|
| 2849 |
|
| 2850 |
int8_t L[QK_K];
|
| 2851 |
float scales[QK_K/16];
|
|
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
| 2925 |
}
|
| 2926 |
}
|
| 2927 |
|
| 2928 |
-
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y,
|
| 2929 |
assert(k % QK_K == 0);
|
| 2930 |
-
const
|
| 2931 |
|
| 2932 |
for (int i = 0; i < nb; i++) {
|
| 2933 |
|
|
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
|
| 2972 |
}
|
| 2973 |
}
|
| 2974 |
|
| 2975 |
-
void quantize_row_q6_K(const float * restrict x, void * restrict vy,
|
| 2976 |
assert(k % QK_K == 0);
|
| 2977 |
block_q6_K * restrict y = vy;
|
| 2978 |
quantize_row_q6_K_reference(x, y, k);
|
| 2979 |
}
|
| 2980 |
|
| 2981 |
-
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y,
|
| 2982 |
#if QK_K != 256
|
| 2983 |
(void)quant_weights;
|
| 2984 |
quantize_row_q6_K_reference(x, y, n_per_row);
|
| 2985 |
#else
|
| 2986 |
assert(n_per_row % QK_K == 0);
|
| 2987 |
-
const
|
| 2988 |
|
| 2989 |
int8_t L[QK_K];
|
| 2990 |
float scales[QK_K/16];
|
|
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
| 3067 |
#endif
|
| 3068 |
}
|
| 3069 |
|
| 3070 |
-
size_t quantize_q6_K(const float * restrict src, void * restrict dst,
|
| 3071 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3072 |
if (!quant_weights) {
|
| 3073 |
-
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
| 3074 |
}
|
| 3075 |
else {
|
| 3076 |
char * qrow = (char *)dst;
|
| 3077 |
-
for (
|
| 3078 |
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
| 3079 |
src += n_per_row;
|
| 3080 |
qrow += row_size;
|
|
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
|
|
| 3083 |
return nrow * row_size;
|
| 3084 |
}
|
| 3085 |
|
| 3086 |
-
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y,
|
| 3087 |
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
| 3088 |
|
| 3089 |
if (!quant_weights) {
|
|
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
| 3098 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3099 |
float sigma2 = sum_x2/n_per_row;
|
| 3100 |
|
| 3101 |
-
const
|
| 3102 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3103 |
const float * xb = x + QK4_0 * ib;
|
| 3104 |
const float * qw = quant_weights + QK4_0 * ib;
|
|
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
| 3111 |
}
|
| 3112 |
}
|
| 3113 |
|
| 3114 |
-
size_t quantize_q4_0(const float * restrict src, void * restrict dst,
|
| 3115 |
if (!quant_weights) {
|
| 3116 |
-
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
| 3117 |
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3118 |
}
|
| 3119 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3120 |
char * qrow = (char *)dst;
|
| 3121 |
-
for (
|
| 3122 |
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
| 3123 |
src += n_per_row;
|
| 3124 |
qrow += row_size;
|
|
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
|
|
| 3126 |
return nrow * row_size;
|
| 3127 |
}
|
| 3128 |
|
| 3129 |
-
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y,
|
| 3130 |
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
| 3131 |
|
| 3132 |
if (!quant_weights) {
|
|
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
| 3141 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3142 |
float sigma2 = sum_x2/n_per_row;
|
| 3143 |
|
| 3144 |
-
const
|
| 3145 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3146 |
const float * xb = x + QK4_1 * ib;
|
| 3147 |
const float * qw = quant_weights + QK4_1 * ib;
|
|
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
| 3156 |
}
|
| 3157 |
}
|
| 3158 |
|
| 3159 |
-
size_t quantize_q4_1(const float * restrict src, void * restrict dst,
|
| 3160 |
if (!quant_weights) {
|
| 3161 |
-
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
| 3162 |
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3163 |
}
|
| 3164 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3165 |
char * qrow = (char *)dst;
|
| 3166 |
-
for (
|
| 3167 |
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
| 3168 |
src += n_per_row;
|
| 3169 |
qrow += row_size;
|
|
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
|
|
| 3171 |
return nrow * row_size;
|
| 3172 |
}
|
| 3173 |
|
| 3174 |
-
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y,
|
| 3175 |
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
| 3176 |
|
| 3177 |
if (!quant_weights) {
|
|
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
| 3186 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3187 |
float sigma2 = sum_x2/n_per_row;
|
| 3188 |
|
| 3189 |
-
const
|
| 3190 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3191 |
const float * xb = x + QK5_0 * ib;
|
| 3192 |
const float * qw = quant_weights + QK5_0 * ib;
|
|
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
| 3210 |
}
|
| 3211 |
}
|
| 3212 |
|
| 3213 |
-
size_t quantize_q5_0(const float * restrict src, void * restrict dst,
|
| 3214 |
if (!quant_weights) {
|
| 3215 |
-
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
| 3216 |
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3217 |
}
|
| 3218 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3219 |
char * qrow = (char *)dst;
|
| 3220 |
-
for (
|
| 3221 |
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
| 3222 |
src += n_per_row;
|
| 3223 |
qrow += row_size;
|
|
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
|
|
| 3225 |
return nrow * row_size;
|
| 3226 |
}
|
| 3227 |
|
| 3228 |
-
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y,
|
| 3229 |
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
| 3230 |
|
| 3231 |
if (!quant_weights) {
|
|
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
| 3240 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3241 |
float sigma2 = sum_x2/n_per_row;
|
| 3242 |
|
| 3243 |
-
const
|
| 3244 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3245 |
const float * xb = x + QK5_1 * ib;
|
| 3246 |
const float * qw = quant_weights + QK5_1 * ib;
|
|
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
| 3263 |
}
|
| 3264 |
}
|
| 3265 |
|
| 3266 |
-
size_t quantize_q5_1(const float * restrict src, void * restrict dst,
|
| 3267 |
if (!quant_weights) {
|
| 3268 |
-
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
| 3269 |
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3270 |
}
|
| 3271 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3272 |
char * qrow = (char *)dst;
|
| 3273 |
-
for (
|
| 3274 |
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
| 3275 |
src += n_per_row;
|
| 3276 |
qrow += row_size;
|
|
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
|
|
| 3278 |
return nrow * row_size;
|
| 3279 |
}
|
| 3280 |
|
| 3281 |
-
size_t quantize_q8_0(const float * restrict src, void * restrict dst,
|
| 3282 |
(void)quant_weights; // not used
|
| 3283 |
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
| 3284 |
-
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
| 3285 |
return nrow * row_size;
|
| 3286 |
}
|
| 3287 |
|
| 3288 |
// ====================== "True" 2-bit (de)-quantization
|
| 3289 |
|
| 3290 |
-
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y,
|
| 3291 |
assert(k % QK_K == 0);
|
| 3292 |
-
const
|
| 3293 |
|
| 3294 |
uint32_t aux32[2];
|
| 3295 |
const uint8_t * aux8 = (const uint8_t *)aux32;
|
|
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
| 3315 |
|
| 3316 |
// ====================== 2.3125 bpw (de)-quantization
|
| 3317 |
|
| 3318 |
-
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
| 3319 |
assert(k % QK_K == 0);
|
| 3320 |
-
const
|
| 3321 |
|
| 3322 |
float db[2];
|
| 3323 |
|
|
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
| 3342 |
|
| 3343 |
// ====================== 2.5625 bpw (de)-quantization
|
| 3344 |
|
| 3345 |
-
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y,
|
| 3346 |
assert(k % QK_K == 0);
|
| 3347 |
-
const
|
| 3348 |
|
| 3349 |
float db[2];
|
| 3350 |
|
|
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
|
|
| 3374 |
|
| 3375 |
// ====================== 3.0625 bpw (de)-quantization
|
| 3376 |
|
| 3377 |
-
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y,
|
| 3378 |
assert(k % QK_K == 0);
|
| 3379 |
-
const
|
| 3380 |
|
| 3381 |
uint32_t aux32;
|
| 3382 |
|
|
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
| 3406 |
|
| 3407 |
// ====================== 3.3125 bpw (de)-quantization
|
| 3408 |
|
| 3409 |
-
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y,
|
| 3410 |
assert(k % QK_K == 0);
|
| 3411 |
-
const
|
| 3412 |
|
| 3413 |
for (int i = 0; i < nb; i++) {
|
| 3414 |
|
|
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
| 3449 |
|
| 3450 |
// ====================== 1.5625 bpw (de)-quantization
|
| 3451 |
|
| 3452 |
-
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y,
|
| 3453 |
assert(k % QK_K == 0);
|
| 3454 |
-
const
|
| 3455 |
|
| 3456 |
for (int i = 0; i < nb; i++) {
|
| 3457 |
|
|
@@ -3474,9 +3474,9 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
| 3474 |
}
|
| 3475 |
}
|
| 3476 |
|
| 3477 |
-
void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y,
|
| 3478 |
assert(k % QK_K == 0);
|
| 3479 |
-
const
|
| 3480 |
|
| 3481 |
float delta[4];
|
| 3482 |
uint16_t idx[4];
|
|
@@ -3535,9 +3535,9 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
|
|
| 3535 |
|
| 3536 |
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
| 3537 |
|
| 3538 |
-
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
| 3539 |
assert(k % QK4_NL == 0);
|
| 3540 |
-
const
|
| 3541 |
|
| 3542 |
for (int i = 0; i < nb; i++) {
|
| 3543 |
|
|
@@ -3553,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
| 3553 |
}
|
| 3554 |
}
|
| 3555 |
|
| 3556 |
-
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
| 3557 |
assert(k % QK_K == 0);
|
| 3558 |
#if QK_K == 64
|
| 3559 |
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
| 3560 |
#else
|
| 3561 |
-
const
|
| 3562 |
|
| 3563 |
for (int i = 0; i < nb; i++) {
|
| 3564 |
|
|
@@ -3582,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|
| 3582 |
|
| 3583 |
//===================================== Q8_K ==============================================
|
| 3584 |
|
| 3585 |
-
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y,
|
| 3586 |
assert(k % QK_K == 0);
|
| 3587 |
-
const
|
| 3588 |
|
| 3589 |
for (int i = 0; i < nb; i++) {
|
| 3590 |
|
|
@@ -3621,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
|
|
| 3621 |
}
|
| 3622 |
}
|
| 3623 |
|
| 3624 |
-
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y,
|
| 3625 |
assert(k % QK_K == 0);
|
| 3626 |
-
const
|
| 3627 |
|
| 3628 |
for (int i = 0; i < nb; i++) {
|
| 3629 |
for (int j = 0; j < QK_K; ++j) {
|
|
@@ -3632,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
|
|
| 3632 |
}
|
| 3633 |
}
|
| 3634 |
|
| 3635 |
-
void quantize_row_q8_K(const float * restrict x, void * restrict y,
|
| 3636 |
quantize_row_q8_K_reference(x, y, k);
|
| 3637 |
}
|
| 3638 |
|
|
@@ -10648,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
| 10648 |
return grid_index;
|
| 10649 |
}
|
| 10650 |
|
| 10651 |
-
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy,
|
| 10652 |
|
| 10653 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
| 10654 |
|
|
@@ -10664,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
| 10664 |
|
| 10665 |
const int kMaxQ = 3;
|
| 10666 |
|
| 10667 |
-
const
|
| 10668 |
|
| 10669 |
block_iq2_xxs * y = vy;
|
| 10670 |
|
|
@@ -10821,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
| 10821 |
}
|
| 10822 |
}
|
| 10823 |
|
| 10824 |
-
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy,
|
| 10825 |
|
| 10826 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
| 10827 |
|
|
@@ -10837,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
| 10837 |
|
| 10838 |
const int kMaxQ = 3;
|
| 10839 |
|
| 10840 |
-
const
|
| 10841 |
|
| 10842 |
block_iq2_xs * y = vy;
|
| 10843 |
|
|
@@ -11001,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
| 11001 |
}
|
| 11002 |
}
|
| 11003 |
|
| 11004 |
-
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst,
|
| 11005 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11006 |
-
|
| 11007 |
char * qrow = (char *)dst;
|
| 11008 |
-
for (
|
| 11009 |
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
| 11010 |
src += n_per_row;
|
| 11011 |
qrow += nblock*sizeof(block_iq2_xxs);
|
|
@@ -11013,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
|
|
| 11013 |
return nrow * nblock * sizeof(block_iq2_xxs);
|
| 11014 |
}
|
| 11015 |
|
| 11016 |
-
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst,
|
| 11017 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11018 |
-
|
| 11019 |
char * qrow = (char *)dst;
|
| 11020 |
-
for (
|
| 11021 |
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
| 11022 |
src += n_per_row;
|
| 11023 |
qrow += nblock*sizeof(block_iq2_xs);
|
|
@@ -11242,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
| 11242 |
return grid_index;
|
| 11243 |
}
|
| 11244 |
|
| 11245 |
-
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy,
|
| 11246 |
const float * restrict quant_weights) {
|
| 11247 |
|
| 11248 |
const int gindex = iq3_data_index(grid_size);
|
|
@@ -11259,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
| 11259 |
|
| 11260 |
const int kMaxQ = 8;
|
| 11261 |
|
| 11262 |
-
const
|
| 11263 |
|
| 11264 |
ggml_fp16_t * dh;
|
| 11265 |
uint8_t * qs;
|
|
@@ -11455,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
| 11455 |
}
|
| 11456 |
}
|
| 11457 |
|
| 11458 |
-
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst,
|
| 11459 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11460 |
-
|
| 11461 |
char * qrow = (char *)dst;
|
| 11462 |
-
for (
|
| 11463 |
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
| 11464 |
src += n_per_row;
|
| 11465 |
qrow += nblock*sizeof(block_iq3_xxs);
|
|
@@ -11467,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
|
|
| 11467 |
return nrow * nblock * sizeof(block_iq3_xxs);
|
| 11468 |
}
|
| 11469 |
|
| 11470 |
-
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy,
|
| 11471 |
assert(k % QK_K == 0);
|
| 11472 |
block_iq3_xxs * restrict y = vy;
|
| 11473 |
quantize_row_iq3_xxs_reference(x, y, k);
|
| 11474 |
}
|
| 11475 |
|
| 11476 |
-
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y,
|
| 11477 |
assert(k % QK_K == 0);
|
| 11478 |
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
| 11479 |
}
|
|
@@ -11504,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
| 11504 |
|
| 11505 |
const int kMaxQ = 8;
|
| 11506 |
|
| 11507 |
-
const
|
| 11508 |
|
| 11509 |
block_iq3_s * y = vy;
|
| 11510 |
|
|
@@ -11661,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
| 11661 |
}
|
| 11662 |
|
| 11663 |
#define IQ3S_BLOCK_SIZE 32
|
| 11664 |
-
size_t quantize_iq3_s(const float * restrict src, void * restrict dst,
|
| 11665 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11666 |
-
|
| 11667 |
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
| 11668 |
float weight[IQ3S_BLOCK_SIZE];
|
| 11669 |
float xval[IQ3S_BLOCK_SIZE];
|
|
@@ -11674,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
|
|
| 11674 |
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
| 11675 |
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
| 11676 |
char * qrow = (char *)dst;
|
| 11677 |
-
for (
|
| 11678 |
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
| 11679 |
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
| 11680 |
src += n_per_row;
|
|
@@ -11683,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
|
|
| 11683 |
return nrow * nblock * sizeof(block_iq3_s);
|
| 11684 |
}
|
| 11685 |
|
| 11686 |
-
void quantize_row_iq3_s(const float * restrict x, void * restrict vy,
|
| 11687 |
assert(k % QK_K == 0);
|
| 11688 |
block_iq3_s * restrict y = vy;
|
| 11689 |
quantize_row_iq3_s_reference(x, y, k);
|
| 11690 |
}
|
| 11691 |
|
| 11692 |
-
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y,
|
| 11693 |
assert(k % QK_K == 0);
|
| 11694 |
quantize_iq3_s(x, y, 1, k, NULL);
|
| 11695 |
}
|
|
@@ -11822,7 +11822,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
|
|
| 11822 |
|
| 11823 |
#define IQ1S_BLOCK_SIZE 32
|
| 11824 |
#define IQ1M_BLOCK_SIZE 16
|
| 11825 |
-
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy,
|
| 11826 |
float * scales,
|
| 11827 |
float * weight,
|
| 11828 |
float * sumx,
|
|
@@ -11846,7 +11846,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
| 11846 |
|
| 11847 |
block_iq1_s * y = vy;
|
| 11848 |
|
| 11849 |
-
const
|
| 11850 |
|
| 11851 |
const int block_size = IQ1S_BLOCK_SIZE;
|
| 11852 |
|
|
@@ -11980,7 +11980,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
| 11980 |
}
|
| 11981 |
}
|
| 11982 |
|
| 11983 |
-
size_t quantize_iq1_s(const float * restrict src, void * restrict dst,
|
| 11984 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11985 |
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
| 11986 |
float weight[IQ1S_BLOCK_SIZE];
|
|
@@ -11990,9 +11990,9 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
|
|
| 11990 |
float pairs[2*IQ1S_BLOCK_SIZE];
|
| 11991 |
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
| 11992 |
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
| 11993 |
-
|
| 11994 |
char * qrow = (char *)dst;
|
| 11995 |
-
for (
|
| 11996 |
quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
|
| 11997 |
src += n_per_row;
|
| 11998 |
qrow += nblock*sizeof(block_iq1_s);
|
|
@@ -12000,7 +12000,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
|
|
| 12000 |
return nrow * nblock * sizeof(block_iq1_s);
|
| 12001 |
}
|
| 12002 |
|
| 12003 |
-
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy,
|
| 12004 |
float * scales,
|
| 12005 |
float * weight,
|
| 12006 |
float * pairs,
|
|
@@ -12022,7 +12022,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
| 12022 |
|
| 12023 |
block_iq1_m * y = vy;
|
| 12024 |
|
| 12025 |
-
const
|
| 12026 |
|
| 12027 |
const int block_size = IQ1M_BLOCK_SIZE;
|
| 12028 |
|
|
@@ -12265,7 +12265,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
| 12265 |
}
|
| 12266 |
}
|
| 12267 |
|
| 12268 |
-
size_t quantize_iq1_m(const float * restrict src, void * restrict dst,
|
| 12269 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12270 |
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
| 12271 |
float weight[IQ1M_BLOCK_SIZE];
|
|
@@ -12273,9 +12273,9 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow,
|
|
| 12273 |
float pairs[2*IQ1M_BLOCK_SIZE];
|
| 12274 |
uint16_t index[IQ1M_BLOCK_SIZE/8];
|
| 12275 |
int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
|
| 12276 |
-
|
| 12277 |
char * qrow = (char *)dst;
|
| 12278 |
-
for (
|
| 12279 |
quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
|
| 12280 |
src += n_per_row;
|
| 12281 |
qrow += nblock*sizeof(block_iq1_m);
|
|
@@ -12407,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
| 12407 |
}
|
| 12408 |
}
|
| 12409 |
|
| 12410 |
-
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst,
|
| 12411 |
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
| 12412 |
-
|
| 12413 |
char * qrow = (char *)dst;
|
| 12414 |
uint8_t L[QK4_NL];
|
| 12415 |
float weight[QK4_NL];
|
| 12416 |
uint16_t unused_h;
|
| 12417 |
uint8_t * unused_l = NULL;
|
| 12418 |
float scale;
|
| 12419 |
-
for (
|
| 12420 |
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
| 12421 |
for (int ibl = 0; ibl < nblock; ++ibl) {
|
| 12422 |
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
|
@@ -12429,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
|
|
| 12429 |
return nrow * nblock * sizeof(block_iq4_nl);
|
| 12430 |
}
|
| 12431 |
|
| 12432 |
-
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy,
|
| 12433 |
GGML_ASSERT(k%QK4_NL == 0);
|
| 12434 |
-
|
| 12435 |
uint8_t L[QK4_NL];
|
| 12436 |
float weight[QK4_NL];
|
| 12437 |
uint16_t unused_h;
|
|
@@ -12444,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
| 12444 |
}
|
| 12445 |
}
|
| 12446 |
|
| 12447 |
-
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y,
|
| 12448 |
assert(k % QK4_NL == 0);
|
| 12449 |
quantize_row_iq4_nl(x, y, k);
|
| 12450 |
}
|
| 12451 |
|
| 12452 |
-
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst,
|
| 12453 |
#if QK_K == 64
|
| 12454 |
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
| 12455 |
#else
|
| 12456 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12457 |
-
|
| 12458 |
char * qrow = (char *)dst;
|
| 12459 |
uint8_t L[QK_K];
|
| 12460 |
float weight[32];
|
| 12461 |
float scales[QK_K/32];
|
| 12462 |
-
for (
|
| 12463 |
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
| 12464 |
for (int ibl = 0; ibl < nblock; ++ibl) {
|
| 12465 |
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
|
@@ -12473,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
|
|
| 12473 |
#endif
|
| 12474 |
}
|
| 12475 |
|
| 12476 |
-
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy,
|
| 12477 |
assert(k % QK_K == 0);
|
| 12478 |
block_iq4_xs * restrict y = vy;
|
| 12479 |
quantize_row_iq4_xs_reference(x, y, k);
|
| 12480 |
}
|
| 12481 |
|
| 12482 |
-
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y,
|
| 12483 |
assert(k % QK_K == 0);
|
| 12484 |
quantize_iq4_xs(x, y, 1, k, NULL);
|
| 12485 |
}
|
| 12486 |
|
| 12487 |
// =============================== 2.5625 bpw
|
| 12488 |
|
| 12489 |
-
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy,
|
| 12490 |
|
| 12491 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
| 12492 |
|
|
@@ -12501,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
| 12501 |
|
| 12502 |
const int kMaxQ = 3;
|
| 12503 |
|
| 12504 |
-
const
|
| 12505 |
|
| 12506 |
block_iq2_s * y = vy;
|
| 12507 |
|
|
@@ -12654,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
| 12654 |
}
|
| 12655 |
}
|
| 12656 |
|
| 12657 |
-
size_t quantize_iq2_s(const float * restrict src, void * restrict dst,
|
| 12658 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12659 |
-
|
| 12660 |
char * qrow = (char *)dst;
|
| 12661 |
-
for (
|
| 12662 |
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
| 12663 |
src += n_per_row;
|
| 12664 |
qrow += nblock*sizeof(block_iq2_s);
|
|
@@ -12666,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
|
|
| 12666 |
return nrow * nblock * sizeof(block_iq2_s);
|
| 12667 |
}
|
| 12668 |
|
| 12669 |
-
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y,
|
| 12670 |
assert(k % QK_K == 0);
|
| 12671 |
quantize_iq2_s(x, y, 1, k, NULL);
|
| 12672 |
}
|
| 12673 |
|
| 12674 |
-
void quantize_row_iq2_s(const float * restrict x, void * restrict vy,
|
| 12675 |
assert(k % QK_K == 0);
|
| 12676 |
block_iq2_s * restrict y = vy;
|
| 12677 |
quantize_row_iq2_s_reference(x, y, k);
|
|
|
|
| 544 |
#endif
|
| 545 |
|
| 546 |
// reference implementation for deterministic creation of model files
|
| 547 |
+
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
| 548 |
static const int qk = QK4_0;
|
| 549 |
|
| 550 |
assert(k % qk == 0);
|
|
|
|
| 581 |
}
|
| 582 |
}
|
| 583 |
|
| 584 |
+
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 585 |
quantize_row_q4_0_reference(x, y, k);
|
| 586 |
}
|
| 587 |
|
| 588 |
|
| 589 |
+
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
| 590 |
const int qk = QK4_1;
|
| 591 |
|
| 592 |
assert(k % qk == 0);
|
|
|
|
| 623 |
}
|
| 624 |
}
|
| 625 |
|
| 626 |
+
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 627 |
quantize_row_q4_1_reference(x, y, k);
|
| 628 |
}
|
| 629 |
|
| 630 |
+
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
| 631 |
static const int qk = QK5_0;
|
| 632 |
|
| 633 |
assert(k % qk == 0);
|
|
|
|
| 671 |
}
|
| 672 |
}
|
| 673 |
|
| 674 |
+
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 675 |
quantize_row_q5_0_reference(x, y, k);
|
| 676 |
}
|
| 677 |
|
| 678 |
+
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
| 679 |
const int qk = QK5_1;
|
| 680 |
|
| 681 |
assert(k % qk == 0);
|
|
|
|
| 719 |
}
|
| 720 |
}
|
| 721 |
|
| 722 |
+
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 723 |
quantize_row_q5_1_reference(x, y, k);
|
| 724 |
}
|
| 725 |
|
| 726 |
// reference implementation for deterministic creation of model files
|
| 727 |
+
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
| 728 |
assert(k % QK8_0 == 0);
|
| 729 |
const int nb = k / QK8_0;
|
| 730 |
|
|
|
|
| 749 |
}
|
| 750 |
}
|
| 751 |
|
| 752 |
+
void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
|
| 753 |
assert(QK8_0 == 32);
|
| 754 |
assert(k % QK8_0 == 0);
|
| 755 |
const int nb = k / QK8_0;
|
|
|
|
| 938 |
}
|
| 939 |
|
| 940 |
// reference implementation for deterministic creation of model files
|
| 941 |
+
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
| 942 |
assert(QK8_1 == 32);
|
| 943 |
assert(k % QK8_1 == 0);
|
| 944 |
const int nb = k / QK8_1;
|
|
|
|
| 973 |
}
|
| 974 |
}
|
| 975 |
|
| 976 |
+
void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
|
| 977 |
assert(k % QK8_1 == 0);
|
| 978 |
const int nb = k / QK8_1;
|
| 979 |
|
|
|
|
| 1192 |
#endif
|
| 1193 |
}
|
| 1194 |
|
| 1195 |
+
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
|
| 1196 |
static const int qk = QK4_0;
|
| 1197 |
|
| 1198 |
assert(k % qk == 0);
|
|
|
|
| 1212 |
}
|
| 1213 |
}
|
| 1214 |
|
| 1215 |
+
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
|
| 1216 |
static const int qk = QK4_1;
|
| 1217 |
|
| 1218 |
assert(k % qk == 0);
|
|
|
|
| 1233 |
}
|
| 1234 |
}
|
| 1235 |
|
| 1236 |
+
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
|
| 1237 |
static const int qk = QK5_0;
|
| 1238 |
|
| 1239 |
assert(k % qk == 0);
|
|
|
|
| 1259 |
}
|
| 1260 |
}
|
| 1261 |
|
| 1262 |
+
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
|
| 1263 |
static const int qk = QK5_1;
|
| 1264 |
|
| 1265 |
assert(k % qk == 0);
|
|
|
|
| 1286 |
}
|
| 1287 |
}
|
| 1288 |
|
| 1289 |
+
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
|
| 1290 |
static const int qk = QK8_0;
|
| 1291 |
|
| 1292 |
assert(k % qk == 0);
|
|
|
|
| 1581 |
|
| 1582 |
//========================- 2-bit (de)-quantization
|
| 1583 |
|
| 1584 |
+
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
| 1585 |
assert(k % QK_K == 0);
|
| 1586 |
const int nb = k / QK_K;
|
| 1587 |
|
|
|
|
| 1658 |
}
|
| 1659 |
}
|
| 1660 |
|
| 1661 |
+
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
|
| 1662 |
assert(k % QK_K == 0);
|
| 1663 |
const int nb = k / QK_K;
|
| 1664 |
|
|
|
|
| 1704 |
}
|
| 1705 |
}
|
| 1706 |
|
| 1707 |
+
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 1708 |
quantize_row_q2_K_reference(x, vy, k);
|
| 1709 |
}
|
| 1710 |
|
|
|
|
| 1960 |
}
|
| 1961 |
}
|
| 1962 |
|
| 1963 |
+
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 1964 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1965 |
if (!quant_weights) {
|
| 1966 |
+
quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 1967 |
}
|
| 1968 |
else {
|
| 1969 |
char * qrow = (char *)dst;
|
| 1970 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 1971 |
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
| 1972 |
src += n_per_row;
|
| 1973 |
qrow += row_size;
|
|
|
|
| 1978 |
|
| 1979 |
//========================= 3-bit (de)-quantization
|
| 1980 |
|
| 1981 |
+
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
| 1982 |
assert(k % QK_K == 0);
|
| 1983 |
const int nb = k / QK_K;
|
| 1984 |
|
|
|
|
| 2092 |
}
|
| 2093 |
|
| 2094 |
#if QK_K == 256
|
| 2095 |
+
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
| 2096 |
assert(k % QK_K == 0);
|
| 2097 |
const int nb = k / QK_K;
|
| 2098 |
|
|
|
|
| 2142 |
}
|
| 2143 |
}
|
| 2144 |
#else
|
| 2145 |
+
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
| 2146 |
assert(k % QK_K == 0);
|
| 2147 |
assert(QK_K == 64);
|
| 2148 |
const int nb = k / QK_K;
|
|
|
|
| 2175 |
}
|
| 2176 |
#endif
|
| 2177 |
|
| 2178 |
+
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2179 |
quantize_row_q3_K_reference(x, vy, k);
|
| 2180 |
}
|
| 2181 |
|
| 2182 |
+
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
| 2183 |
#if QK_K != 256
|
| 2184 |
(void)quant_weights;
|
| 2185 |
quantize_row_q3_K_reference(x, y, n_per_row);
|
|
|
|
| 2268 |
#endif
|
| 2269 |
}
|
| 2270 |
|
| 2271 |
+
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2272 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2273 |
if (!quant_weights) {
|
| 2274 |
+
quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 2275 |
}
|
| 2276 |
else {
|
| 2277 |
char * qrow = (char *)dst;
|
| 2278 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 2279 |
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
| 2280 |
src += n_per_row;
|
| 2281 |
qrow += row_size;
|
|
|
|
| 2286 |
|
| 2287 |
// ====================== 4-bit (de)-quantization
|
| 2288 |
|
| 2289 |
+
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
| 2290 |
assert(k % QK_K == 0);
|
| 2291 |
const int nb = k / QK_K;
|
| 2292 |
|
|
|
|
| 2393 |
}
|
| 2394 |
}
|
| 2395 |
|
| 2396 |
+
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
|
| 2397 |
assert(k % QK_K == 0);
|
| 2398 |
const int nb = k / QK_K;
|
| 2399 |
|
|
|
|
| 2432 |
}
|
| 2433 |
}
|
| 2434 |
|
| 2435 |
+
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2436 |
assert(k % QK_K == 0);
|
| 2437 |
block_q4_K * restrict y = vy;
|
| 2438 |
quantize_row_q4_K_reference(x, y, k);
|
| 2439 |
}
|
| 2440 |
|
| 2441 |
+
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 2442 |
#if QK_K != 256
|
| 2443 |
(void)quant_weights;
|
| 2444 |
quantize_row_q4_K_reference(x, y, n_per_row);
|
| 2445 |
#else
|
| 2446 |
assert(n_per_row % QK_K == 0);
|
| 2447 |
+
const int64_t nb = n_per_row / QK_K;
|
| 2448 |
|
| 2449 |
uint8_t L[QK_K];
|
| 2450 |
uint8_t Laux[32];
|
|
|
|
| 2516 |
#endif
|
| 2517 |
}
|
| 2518 |
|
| 2519 |
+
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2520 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2521 |
if (!quant_weights) {
|
| 2522 |
+
quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 2523 |
}
|
| 2524 |
else {
|
| 2525 |
char * qrow = (char *)dst;
|
| 2526 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 2527 |
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
| 2528 |
src += n_per_row;
|
| 2529 |
qrow += row_size;
|
|
|
|
| 2534 |
|
| 2535 |
// ====================== 5-bit (de)-quantization
|
| 2536 |
|
| 2537 |
+
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
| 2538 |
assert(k % QK_K == 0);
|
| 2539 |
+
const int64_t nb = k / QK_K;
|
| 2540 |
|
| 2541 |
#if QK_K == 256
|
| 2542 |
uint8_t L[QK_K];
|
|
|
|
| 2676 |
}
|
| 2677 |
}
|
| 2678 |
|
| 2679 |
+
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
|
| 2680 |
assert(k % QK_K == 0);
|
| 2681 |
+
const int64_t nb = k / QK_K;
|
| 2682 |
|
| 2683 |
for (int i = 0; i < nb; i++) {
|
| 2684 |
|
|
|
|
| 2721 |
}
|
| 2722 |
}
|
| 2723 |
|
| 2724 |
+
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2725 |
assert(k % QK_K == 0);
|
| 2726 |
block_q5_K * restrict y = vy;
|
| 2727 |
quantize_row_q5_K_reference(x, y, k);
|
| 2728 |
}
|
| 2729 |
|
| 2730 |
+
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 2731 |
#if QK_K != 256
|
| 2732 |
(void)quant_weights;
|
| 2733 |
quantize_row_q5_K_reference(x, y, n_per_row);
|
| 2734 |
#else
|
| 2735 |
assert(n_per_row % QK_K == 0);
|
| 2736 |
+
const int64_t nb = n_per_row / QK_K;
|
| 2737 |
|
| 2738 |
uint8_t L[QK_K];
|
| 2739 |
uint8_t Laux[32];
|
|
|
|
| 2825 |
#endif
|
| 2826 |
}
|
| 2827 |
|
| 2828 |
+
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2829 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2830 |
if (!quant_weights) {
|
| 2831 |
+
quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 2832 |
}
|
| 2833 |
else {
|
| 2834 |
char * qrow = (char *)dst;
|
| 2835 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 2836 |
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
| 2837 |
src += n_per_row;
|
| 2838 |
qrow += row_size;
|
|
|
|
| 2843 |
|
| 2844 |
// ====================== 6-bit (de)-quantization
|
| 2845 |
|
| 2846 |
+
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
| 2847 |
assert(k % QK_K == 0);
|
| 2848 |
+
const int64_t nb = k / QK_K;
|
| 2849 |
|
| 2850 |
int8_t L[QK_K];
|
| 2851 |
float scales[QK_K/16];
|
|
|
|
| 2925 |
}
|
| 2926 |
}
|
| 2927 |
|
| 2928 |
+
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
|
| 2929 |
assert(k % QK_K == 0);
|
| 2930 |
+
const int64_t nb = k / QK_K;
|
| 2931 |
|
| 2932 |
for (int i = 0; i < nb; i++) {
|
| 2933 |
|
|
|
|
| 2972 |
}
|
| 2973 |
}
|
| 2974 |
|
| 2975 |
+
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2976 |
assert(k % QK_K == 0);
|
| 2977 |
block_q6_K * restrict y = vy;
|
| 2978 |
quantize_row_q6_K_reference(x, y, k);
|
| 2979 |
}
|
| 2980 |
|
| 2981 |
+
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 2982 |
#if QK_K != 256
|
| 2983 |
(void)quant_weights;
|
| 2984 |
quantize_row_q6_K_reference(x, y, n_per_row);
|
| 2985 |
#else
|
| 2986 |
assert(n_per_row % QK_K == 0);
|
| 2987 |
+
const int64_t nb = n_per_row / QK_K;
|
| 2988 |
|
| 2989 |
int8_t L[QK_K];
|
| 2990 |
float scales[QK_K/16];
|
|
|
|
| 3067 |
#endif
|
| 3068 |
}
|
| 3069 |
|
| 3070 |
+
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3071 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3072 |
if (!quant_weights) {
|
| 3073 |
+
quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3074 |
}
|
| 3075 |
else {
|
| 3076 |
char * qrow = (char *)dst;
|
| 3077 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 3078 |
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
| 3079 |
src += n_per_row;
|
| 3080 |
qrow += row_size;
|
|
|
|
| 3083 |
return nrow * row_size;
|
| 3084 |
}
|
| 3085 |
|
| 3086 |
+
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 3087 |
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
| 3088 |
|
| 3089 |
if (!quant_weights) {
|
|
|
|
| 3098 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3099 |
float sigma2 = sum_x2/n_per_row;
|
| 3100 |
|
| 3101 |
+
const int64_t nb = n_per_row/QK4_0;
|
| 3102 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3103 |
const float * xb = x + QK4_0 * ib;
|
| 3104 |
const float * qw = quant_weights + QK4_0 * ib;
|
|
|
|
| 3111 |
}
|
| 3112 |
}
|
| 3113 |
|
| 3114 |
+
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3115 |
if (!quant_weights) {
|
| 3116 |
+
quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3117 |
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3118 |
}
|
| 3119 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3120 |
char * qrow = (char *)dst;
|
| 3121 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 3122 |
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
| 3123 |
src += n_per_row;
|
| 3124 |
qrow += row_size;
|
|
|
|
| 3126 |
return nrow * row_size;
|
| 3127 |
}
|
| 3128 |
|
| 3129 |
+
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 3130 |
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
| 3131 |
|
| 3132 |
if (!quant_weights) {
|
|
|
|
| 3141 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3142 |
float sigma2 = sum_x2/n_per_row;
|
| 3143 |
|
| 3144 |
+
const int64_t nb = n_per_row/QK4_1;
|
| 3145 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3146 |
const float * xb = x + QK4_1 * ib;
|
| 3147 |
const float * qw = quant_weights + QK4_1 * ib;
|
|
|
|
| 3156 |
}
|
| 3157 |
}
|
| 3158 |
|
| 3159 |
+
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3160 |
if (!quant_weights) {
|
| 3161 |
+
quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3162 |
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3163 |
}
|
| 3164 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3165 |
char * qrow = (char *)dst;
|
| 3166 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 3167 |
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
| 3168 |
src += n_per_row;
|
| 3169 |
qrow += row_size;
|
|
|
|
| 3171 |
return nrow * row_size;
|
| 3172 |
}
|
| 3173 |
|
| 3174 |
+
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 3175 |
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
| 3176 |
|
| 3177 |
if (!quant_weights) {
|
|
|
|
| 3186 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3187 |
float sigma2 = sum_x2/n_per_row;
|
| 3188 |
|
| 3189 |
+
const int64_t nb = n_per_row/QK5_0;
|
| 3190 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3191 |
const float * xb = x + QK5_0 * ib;
|
| 3192 |
const float * qw = quant_weights + QK5_0 * ib;
|
|
|
|
| 3210 |
}
|
| 3211 |
}
|
| 3212 |
|
| 3213 |
+
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3214 |
if (!quant_weights) {
|
| 3215 |
+
quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3216 |
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3217 |
}
|
| 3218 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3219 |
char * qrow = (char *)dst;
|
| 3220 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 3221 |
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
| 3222 |
src += n_per_row;
|
| 3223 |
qrow += row_size;
|
|
|
|
| 3225 |
return nrow * row_size;
|
| 3226 |
}
|
| 3227 |
|
| 3228 |
+
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
| 3229 |
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
| 3230 |
|
| 3231 |
if (!quant_weights) {
|
|
|
|
| 3240 |
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
| 3241 |
float sigma2 = sum_x2/n_per_row;
|
| 3242 |
|
| 3243 |
+
const int64_t nb = n_per_row/QK5_1;
|
| 3244 |
for (int ib = 0; ib < nb; ++ib) {
|
| 3245 |
const float * xb = x + QK5_1 * ib;
|
| 3246 |
const float * qw = quant_weights + QK5_1 * ib;
|
|
|
|
| 3263 |
}
|
| 3264 |
}
|
| 3265 |
|
| 3266 |
+
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3267 |
if (!quant_weights) {
|
| 3268 |
+
quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3269 |
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3270 |
}
|
| 3271 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3272 |
char * qrow = (char *)dst;
|
| 3273 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 3274 |
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
| 3275 |
src += n_per_row;
|
| 3276 |
qrow += row_size;
|
|
|
|
| 3278 |
return nrow * row_size;
|
| 3279 |
}
|
| 3280 |
|
| 3281 |
+
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3282 |
(void)quant_weights; // not used
|
| 3283 |
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
| 3284 |
+
quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
| 3285 |
return nrow * row_size;
|
| 3286 |
}
|
| 3287 |
|
| 3288 |
// ====================== "True" 2-bit (de)-quantization
|
| 3289 |
|
| 3290 |
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
|
| 3291 |
assert(k % QK_K == 0);
|
| 3292 |
+
const int64_t nb = k / QK_K;
|
| 3293 |
|
| 3294 |
uint32_t aux32[2];
|
| 3295 |
const uint8_t * aux8 = (const uint8_t *)aux32;
|
|
|
|
| 3315 |
|
| 3316 |
// ====================== 2.3125 bpw (de)-quantization
|
| 3317 |
|
| 3318 |
+
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
|
| 3319 |
assert(k % QK_K == 0);
|
| 3320 |
+
const int64_t nb = k / QK_K;
|
| 3321 |
|
| 3322 |
float db[2];
|
| 3323 |
|
|
|
|
| 3342 |
|
| 3343 |
// ====================== 2.5625 bpw (de)-quantization
|
| 3344 |
|
| 3345 |
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
|
| 3346 |
assert(k % QK_K == 0);
|
| 3347 |
+
const int64_t nb = k / QK_K;
|
| 3348 |
|
| 3349 |
float db[2];
|
| 3350 |
|
|
|
|
| 3374 |
|
| 3375 |
// ====================== 3.0625 bpw (de)-quantization
|
| 3376 |
|
| 3377 |
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
|
| 3378 |
assert(k % QK_K == 0);
|
| 3379 |
+
const int64_t nb = k / QK_K;
|
| 3380 |
|
| 3381 |
uint32_t aux32;
|
| 3382 |
|
|
|
|
| 3406 |
|
| 3407 |
// ====================== 3.3125 bpw (de)-quantization
|
| 3408 |
|
| 3409 |
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
|
| 3410 |
assert(k % QK_K == 0);
|
| 3411 |
+
const int64_t nb = k / QK_K;
|
| 3412 |
|
| 3413 |
for (int i = 0; i < nb; i++) {
|
| 3414 |
|
|
|
|
| 3449 |
|
| 3450 |
// ====================== 1.5625 bpw (de)-quantization
|
| 3451 |
|
| 3452 |
+
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
|
| 3453 |
assert(k % QK_K == 0);
|
| 3454 |
+
const int64_t nb = k / QK_K;
|
| 3455 |
|
| 3456 |
for (int i = 0; i < nb; i++) {
|
| 3457 |
|
|
|
|
| 3474 |
}
|
| 3475 |
}
|
| 3476 |
|
| 3477 |
+
void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
|
| 3478 |
assert(k % QK_K == 0);
|
| 3479 |
+
const int64_t nb = k / QK_K;
|
| 3480 |
|
| 3481 |
float delta[4];
|
| 3482 |
uint16_t idx[4];
|
|
|
|
| 3535 |
|
| 3536 |
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
| 3537 |
|
| 3538 |
+
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
|
| 3539 |
assert(k % QK4_NL == 0);
|
| 3540 |
+
const int64_t nb = k / QK4_NL;
|
| 3541 |
|
| 3542 |
for (int i = 0; i < nb; i++) {
|
| 3543 |
|
|
|
|
| 3553 |
}
|
| 3554 |
}
|
| 3555 |
|
| 3556 |
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
|
| 3557 |
assert(k % QK_K == 0);
|
| 3558 |
#if QK_K == 64
|
| 3559 |
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
| 3560 |
#else
|
| 3561 |
+
const int64_t nb = k / QK_K;
|
| 3562 |
|
| 3563 |
for (int i = 0; i < nb; i++) {
|
| 3564 |
|
|
|
|
| 3582 |
|
| 3583 |
//===================================== Q8_K ==============================================
|
| 3584 |
|
| 3585 |
+
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
| 3586 |
assert(k % QK_K == 0);
|
| 3587 |
+
const int64_t nb = k / QK_K;
|
| 3588 |
|
| 3589 |
for (int i = 0; i < nb; i++) {
|
| 3590 |
|
|
|
|
| 3621 |
}
|
| 3622 |
}
|
| 3623 |
|
| 3624 |
+
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
|
| 3625 |
assert(k % QK_K == 0);
|
| 3626 |
+
const int64_t nb = k / QK_K;
|
| 3627 |
|
| 3628 |
for (int i = 0; i < nb; i++) {
|
| 3629 |
for (int j = 0; j < QK_K; ++j) {
|
|
|
|
| 3632 |
}
|
| 3633 |
}
|
| 3634 |
|
| 3635 |
+
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
| 3636 |
quantize_row_q8_K_reference(x, y, k);
|
| 3637 |
}
|
| 3638 |
|
|
|
|
| 10648 |
return grid_index;
|
| 10649 |
}
|
| 10650 |
|
| 10651 |
+
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
| 10652 |
|
| 10653 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
| 10654 |
|
|
|
|
| 10664 |
|
| 10665 |
const int kMaxQ = 3;
|
| 10666 |
|
| 10667 |
+
const int64_t nbl = n/QK_K;
|
| 10668 |
|
| 10669 |
block_iq2_xxs * y = vy;
|
| 10670 |
|
|
|
|
| 10821 |
}
|
| 10822 |
}
|
| 10823 |
|
| 10824 |
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
| 10825 |
|
| 10826 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
| 10827 |
|
|
|
|
| 10837 |
|
| 10838 |
const int kMaxQ = 3;
|
| 10839 |
|
| 10840 |
+
const int64_t nbl = n/QK_K;
|
| 10841 |
|
| 10842 |
block_iq2_xs * y = vy;
|
| 10843 |
|
|
|
|
| 11001 |
}
|
| 11002 |
}
|
| 11003 |
|
| 11004 |
+
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 11005 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11006 |
+
int64_t nblock = n_per_row/QK_K;
|
| 11007 |
char * qrow = (char *)dst;
|
| 11008 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 11009 |
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
| 11010 |
src += n_per_row;
|
| 11011 |
qrow += nblock*sizeof(block_iq2_xxs);
|
|
|
|
| 11013 |
return nrow * nblock * sizeof(block_iq2_xxs);
|
| 11014 |
}
|
| 11015 |
|
| 11016 |
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 11017 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11018 |
+
int64_t nblock = n_per_row/QK_K;
|
| 11019 |
char * qrow = (char *)dst;
|
| 11020 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 11021 |
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
| 11022 |
src += n_per_row;
|
| 11023 |
qrow += nblock*sizeof(block_iq2_xs);
|
|
|
|
| 11242 |
return grid_index;
|
| 11243 |
}
|
| 11244 |
|
| 11245 |
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
|
| 11246 |
const float * restrict quant_weights) {
|
| 11247 |
|
| 11248 |
const int gindex = iq3_data_index(grid_size);
|
|
|
|
| 11259 |
|
| 11260 |
const int kMaxQ = 8;
|
| 11261 |
|
| 11262 |
+
const int64_t nbl = n/QK_K;
|
| 11263 |
|
| 11264 |
ggml_fp16_t * dh;
|
| 11265 |
uint8_t * qs;
|
|
|
|
| 11455 |
}
|
| 11456 |
}
|
| 11457 |
|
| 11458 |
+
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 11459 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11460 |
+
int64_t nblock = n_per_row/QK_K;
|
| 11461 |
char * qrow = (char *)dst;
|
| 11462 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 11463 |
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
| 11464 |
src += n_per_row;
|
| 11465 |
qrow += nblock*sizeof(block_iq3_xxs);
|
|
|
|
| 11467 |
return nrow * nblock * sizeof(block_iq3_xxs);
|
| 11468 |
}
|
| 11469 |
|
| 11470 |
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 11471 |
assert(k % QK_K == 0);
|
| 11472 |
block_iq3_xxs * restrict y = vy;
|
| 11473 |
quantize_row_iq3_xxs_reference(x, y, k);
|
| 11474 |
}
|
| 11475 |
|
| 11476 |
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
| 11477 |
assert(k % QK_K == 0);
|
| 11478 |
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
| 11479 |
}
|
|
|
|
| 11504 |
|
| 11505 |
const int kMaxQ = 8;
|
| 11506 |
|
| 11507 |
+
const int64_t nbl = n/QK_K;
|
| 11508 |
|
| 11509 |
block_iq3_s * y = vy;
|
| 11510 |
|
|
|
|
| 11661 |
}
|
| 11662 |
|
| 11663 |
#define IQ3S_BLOCK_SIZE 32
|
| 11664 |
+
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 11665 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11666 |
+
int64_t nblock = n_per_row/QK_K;
|
| 11667 |
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
| 11668 |
float weight[IQ3S_BLOCK_SIZE];
|
| 11669 |
float xval[IQ3S_BLOCK_SIZE];
|
|
|
|
| 11674 |
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
| 11675 |
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
| 11676 |
char * qrow = (char *)dst;
|
| 11677 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 11678 |
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
| 11679 |
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
| 11680 |
src += n_per_row;
|
|
|
|
| 11683 |
return nrow * nblock * sizeof(block_iq3_s);
|
| 11684 |
}
|
| 11685 |
|
| 11686 |
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 11687 |
assert(k % QK_K == 0);
|
| 11688 |
block_iq3_s * restrict y = vy;
|
| 11689 |
quantize_row_iq3_s_reference(x, y, k);
|
| 11690 |
}
|
| 11691 |
|
| 11692 |
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
| 11693 |
assert(k % QK_K == 0);
|
| 11694 |
quantize_iq3_s(x, y, 1, k, NULL);
|
| 11695 |
}
|
|
|
|
| 11822 |
|
| 11823 |
#define IQ1S_BLOCK_SIZE 32
|
| 11824 |
#define IQ1M_BLOCK_SIZE 16
|
| 11825 |
+
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
| 11826 |
float * scales,
|
| 11827 |
float * weight,
|
| 11828 |
float * sumx,
|
|
|
|
| 11846 |
|
| 11847 |
block_iq1_s * y = vy;
|
| 11848 |
|
| 11849 |
+
const int64_t nbl = n/QK_K;
|
| 11850 |
|
| 11851 |
const int block_size = IQ1S_BLOCK_SIZE;
|
| 11852 |
|
|
|
|
| 11980 |
}
|
| 11981 |
}
|
| 11982 |
|
| 11983 |
+
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 11984 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11985 |
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
| 11986 |
float weight[IQ1S_BLOCK_SIZE];
|
|
|
|
| 11990 |
float pairs[2*IQ1S_BLOCK_SIZE];
|
| 11991 |
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
| 11992 |
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
| 11993 |
+
int64_t nblock = n_per_row/QK_K;
|
| 11994 |
char * qrow = (char *)dst;
|
| 11995 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 11996 |
quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
|
| 11997 |
src += n_per_row;
|
| 11998 |
qrow += nblock*sizeof(block_iq1_s);
|
|
|
|
| 12000 |
return nrow * nblock * sizeof(block_iq1_s);
|
| 12001 |
}
|
| 12002 |
|
| 12003 |
+
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
| 12004 |
float * scales,
|
| 12005 |
float * weight,
|
| 12006 |
float * pairs,
|
|
|
|
| 12022 |
|
| 12023 |
block_iq1_m * y = vy;
|
| 12024 |
|
| 12025 |
+
const int64_t nbl = n/QK_K;
|
| 12026 |
|
| 12027 |
const int block_size = IQ1M_BLOCK_SIZE;
|
| 12028 |
|
|
|
|
| 12265 |
}
|
| 12266 |
}
|
| 12267 |
|
| 12268 |
+
size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 12269 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12270 |
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
| 12271 |
float weight[IQ1M_BLOCK_SIZE];
|
|
|
|
| 12273 |
float pairs[2*IQ1M_BLOCK_SIZE];
|
| 12274 |
uint16_t index[IQ1M_BLOCK_SIZE/8];
|
| 12275 |
int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
|
| 12276 |
+
int64_t nblock = n_per_row/QK_K;
|
| 12277 |
char * qrow = (char *)dst;
|
| 12278 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 12279 |
quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
|
| 12280 |
src += n_per_row;
|
| 12281 |
qrow += nblock*sizeof(block_iq1_m);
|
|
|
|
| 12407 |
}
|
| 12408 |
}
|
| 12409 |
|
| 12410 |
+
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 12411 |
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
| 12412 |
+
int64_t nblock = n_per_row/QK4_NL;
|
| 12413 |
char * qrow = (char *)dst;
|
| 12414 |
uint8_t L[QK4_NL];
|
| 12415 |
float weight[QK4_NL];
|
| 12416 |
uint16_t unused_h;
|
| 12417 |
uint8_t * unused_l = NULL;
|
| 12418 |
float scale;
|
| 12419 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 12420 |
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
| 12421 |
for (int ibl = 0; ibl < nblock; ++ibl) {
|
| 12422 |
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
|
|
|
| 12429 |
return nrow * nblock * sizeof(block_iq4_nl);
|
| 12430 |
}
|
| 12431 |
|
| 12432 |
+
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
|
| 12433 |
GGML_ASSERT(k%QK4_NL == 0);
|
| 12434 |
+
int64_t nblock = k/QK4_NL;
|
| 12435 |
uint8_t L[QK4_NL];
|
| 12436 |
float weight[QK4_NL];
|
| 12437 |
uint16_t unused_h;
|
|
|
|
| 12444 |
}
|
| 12445 |
}
|
| 12446 |
|
| 12447 |
+
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
| 12448 |
assert(k % QK4_NL == 0);
|
| 12449 |
quantize_row_iq4_nl(x, y, k);
|
| 12450 |
}
|
| 12451 |
|
| 12452 |
+
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 12453 |
#if QK_K == 64
|
| 12454 |
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
| 12455 |
#else
|
| 12456 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12457 |
+
int64_t nblock = n_per_row/QK_K;
|
| 12458 |
char * qrow = (char *)dst;
|
| 12459 |
uint8_t L[QK_K];
|
| 12460 |
float weight[32];
|
| 12461 |
float scales[QK_K/32];
|
| 12462 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 12463 |
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
| 12464 |
for (int ibl = 0; ibl < nblock; ++ibl) {
|
| 12465 |
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
|
|
|
| 12473 |
#endif
|
| 12474 |
}
|
| 12475 |
|
| 12476 |
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 12477 |
assert(k % QK_K == 0);
|
| 12478 |
block_iq4_xs * restrict y = vy;
|
| 12479 |
quantize_row_iq4_xs_reference(x, y, k);
|
| 12480 |
}
|
| 12481 |
|
| 12482 |
+
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
| 12483 |
assert(k % QK_K == 0);
|
| 12484 |
quantize_iq4_xs(x, y, 1, k, NULL);
|
| 12485 |
}
|
| 12486 |
|
| 12487 |
// =============================== 2.5625 bpw
|
| 12488 |
|
| 12489 |
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
| 12490 |
|
| 12491 |
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
| 12492 |
|
|
|
|
| 12501 |
|
| 12502 |
const int kMaxQ = 3;
|
| 12503 |
|
| 12504 |
+
const int64_t nbl = n/QK_K;
|
| 12505 |
|
| 12506 |
block_iq2_s * y = vy;
|
| 12507 |
|
|
|
|
| 12654 |
}
|
| 12655 |
}
|
| 12656 |
|
| 12657 |
+
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 12658 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12659 |
+
int64_t nblock = n_per_row/QK_K;
|
| 12660 |
char * qrow = (char *)dst;
|
| 12661 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 12662 |
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
| 12663 |
src += n_per_row;
|
| 12664 |
qrow += nblock*sizeof(block_iq2_s);
|
|
|
|
| 12666 |
return nrow * nblock * sizeof(block_iq2_s);
|
| 12667 |
}
|
| 12668 |
|
| 12669 |
+
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
| 12670 |
assert(k % QK_K == 0);
|
| 12671 |
quantize_iq2_s(x, y, 1, k, NULL);
|
| 12672 |
}
|
| 12673 |
|
| 12674 |
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 12675 |
assert(k % QK_K == 0);
|
| 12676 |
block_iq2_s * restrict y = vy;
|
| 12677 |
quantize_row_iq2_s_reference(x, y, k);
|
|
@@ -12,70 +12,70 @@ extern "C" {
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
// Quantization
|
| 15 |
-
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y,
|
| 16 |
-
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y,
|
| 17 |
-
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y,
|
| 18 |
-
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y,
|
| 19 |
-
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y,
|
| 20 |
-
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y,
|
| 21 |
-
|
| 22 |
-
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y,
|
| 23 |
-
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y,
|
| 24 |
-
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y,
|
| 25 |
-
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y,
|
| 26 |
-
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y,
|
| 27 |
-
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y,
|
| 28 |
-
|
| 29 |
-
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y,
|
| 30 |
-
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y,
|
| 31 |
-
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y,
|
| 32 |
-
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y,
|
| 33 |
-
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y,
|
| 34 |
-
|
| 35 |
-
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 36 |
-
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 37 |
-
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 38 |
-
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 39 |
-
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 40 |
-
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 41 |
-
|
| 42 |
-
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 43 |
-
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 44 |
-
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 45 |
-
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 46 |
-
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 47 |
-
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 48 |
-
|
| 49 |
-
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 50 |
-
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 51 |
-
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 52 |
-
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 53 |
-
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 54 |
|
| 55 |
// Dequantization
|
| 56 |
-
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 57 |
-
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 58 |
-
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 59 |
-
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 60 |
-
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 61 |
-
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 62 |
-
|
| 63 |
-
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 64 |
-
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 65 |
-
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 66 |
-
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 67 |
-
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 68 |
-
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 69 |
-
|
| 70 |
-
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 71 |
-
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 72 |
-
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 73 |
-
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 74 |
-
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 75 |
-
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 76 |
-
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 77 |
-
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 78 |
-
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 79 |
|
| 80 |
// Dot product
|
| 81 |
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
| 101 |
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 102 |
|
| 103 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 104 |
-
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 105 |
-
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 106 |
-
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 107 |
-
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 108 |
-
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 109 |
-
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 110 |
-
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 111 |
-
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 112 |
-
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 113 |
-
|
| 114 |
-
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 115 |
-
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 116 |
-
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 117 |
-
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 118 |
-
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 119 |
-
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 120 |
-
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 121 |
-
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 122 |
-
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 123 |
-
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
| 124 |
|
| 125 |
void iq2xs_init_impl(enum ggml_type type);
|
| 126 |
void iq2xs_free_impl(enum ggml_type type);
|
|
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
// Quantization
|
| 15 |
+
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
| 16 |
+
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
| 17 |
+
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
| 18 |
+
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
| 19 |
+
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
| 20 |
+
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
| 21 |
+
|
| 22 |
+
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
| 23 |
+
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
| 24 |
+
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
| 25 |
+
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
| 26 |
+
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
| 27 |
+
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
| 28 |
+
|
| 29 |
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
| 30 |
+
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
| 31 |
+
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
| 32 |
+
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
| 33 |
+
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
| 34 |
+
|
| 35 |
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 36 |
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 37 |
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 38 |
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 39 |
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 40 |
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 41 |
+
|
| 42 |
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 43 |
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 44 |
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 45 |
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 46 |
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 47 |
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 48 |
+
|
| 49 |
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 50 |
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 51 |
+
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 52 |
+
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 53 |
+
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 54 |
|
| 55 |
// Dequantization
|
| 56 |
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 57 |
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 58 |
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 59 |
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 60 |
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 61 |
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 62 |
+
|
| 63 |
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 64 |
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 65 |
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 66 |
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 67 |
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 68 |
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 69 |
+
|
| 70 |
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 71 |
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 72 |
+
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 73 |
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 74 |
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 75 |
+
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 76 |
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 77 |
+
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 78 |
+
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 79 |
|
| 80 |
// Dot product
|
| 81 |
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
|
|
| 101 |
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 102 |
|
| 103 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 104 |
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 105 |
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 106 |
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 107 |
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 108 |
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 109 |
+
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 110 |
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 111 |
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 112 |
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 113 |
+
|
| 114 |
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 115 |
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 116 |
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 117 |
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 118 |
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 119 |
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 120 |
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 121 |
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 122 |
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 123 |
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
| 124 |
|
| 125 |
void iq2xs_init_impl(enum ggml_type type);
|
| 126 |
void iq2xs_free_impl(enum ggml_type type);
|
|
@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 338 |
return GGML_FP32_TO_FP16(x);
|
| 339 |
}
|
| 340 |
|
| 341 |
-
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
| 342 |
-
for (
|
| 343 |
y[i] = GGML_FP16_TO_FP32(x[i]);
|
| 344 |
}
|
| 345 |
}
|
| 346 |
|
| 347 |
-
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
| 348 |
-
|
| 349 |
#if defined(__F16C__)
|
| 350 |
for (; i + 7 < n; i += 8) {
|
| 351 |
__m256 x_vec = _mm256_loadu_ps(x + i);
|
|
@@ -20331,11 +20331,11 @@ size_t ggml_quantize_chunk(
|
|
| 20331 |
enum ggml_type type,
|
| 20332 |
const float * src,
|
| 20333 |
void * dst,
|
| 20334 |
-
|
| 20335 |
-
|
| 20336 |
-
|
| 20337 |
const float * imatrix) {
|
| 20338 |
-
const
|
| 20339 |
|
| 20340 |
if (ggml_quantize_requires_imatrix(type)) {
|
| 20341 |
GGML_ASSERT(imatrix != NULL);
|
|
|
|
| 338 |
return GGML_FP32_TO_FP16(x);
|
| 339 |
}
|
| 340 |
|
| 341 |
+
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
|
| 342 |
+
for (int64_t i = 0; i < n; i++) {
|
| 343 |
y[i] = GGML_FP16_TO_FP32(x[i]);
|
| 344 |
}
|
| 345 |
}
|
| 346 |
|
| 347 |
+
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
| 348 |
+
int64_t i = 0;
|
| 349 |
#if defined(__F16C__)
|
| 350 |
for (; i + 7 < n; i += 8) {
|
| 351 |
__m256 x_vec = _mm256_loadu_ps(x + i);
|
|
|
|
| 20331 |
enum ggml_type type,
|
| 20332 |
const float * src,
|
| 20333 |
void * dst,
|
| 20334 |
+
int64_t start,
|
| 20335 |
+
int64_t nrows,
|
| 20336 |
+
int64_t n_per_row,
|
| 20337 |
const float * imatrix) {
|
| 20338 |
+
const int64_t n = (int64_t) nrows * n_per_row;
|
| 20339 |
|
| 20340 |
if (ggml_quantize_requires_imatrix(type)) {
|
| 20341 |
GGML_ASSERT(imatrix != NULL);
|
|
@@ -332,8 +332,8 @@ extern "C" {
|
|
| 332 |
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
| 333 |
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
| 334 |
|
| 335 |
-
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
| 336 |
-
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
| 337 |
|
| 338 |
struct ggml_object;
|
| 339 |
struct ggml_context;
|
|
@@ -2210,9 +2210,9 @@ extern "C" {
|
|
| 2210 |
enum ggml_type type,
|
| 2211 |
const float * src,
|
| 2212 |
void * dst,
|
| 2213 |
-
|
| 2214 |
-
|
| 2215 |
-
|
| 2216 |
const float * imatrix);
|
| 2217 |
|
| 2218 |
//
|
|
@@ -2377,8 +2377,8 @@ extern "C" {
|
|
| 2377 |
#else
|
| 2378 |
#define GGML_RESTRICT restrict
|
| 2379 |
#endif
|
| 2380 |
-
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
| 2381 |
-
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
| 2382 |
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
| 2383 |
const void * GGML_RESTRICT y, size_t by, int nrc);
|
| 2384 |
|
|
|
|
| 332 |
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
| 333 |
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
| 334 |
|
| 335 |
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
|
| 336 |
+
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
|
| 337 |
|
| 338 |
struct ggml_object;
|
| 339 |
struct ggml_context;
|
|
|
|
| 2210 |
enum ggml_type type,
|
| 2211 |
const float * src,
|
| 2212 |
void * dst,
|
| 2213 |
+
int64_t start,
|
| 2214 |
+
int64_t nrows,
|
| 2215 |
+
int64_t n_per_row,
|
| 2216 |
const float * imatrix);
|
| 2217 |
|
| 2218 |
//
|
|
|
|
| 2377 |
#else
|
| 2378 |
#define GGML_RESTRICT restrict
|
| 2379 |
#endif
|
| 2380 |
+
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 2381 |
+
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 2382 |
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
| 2383 |
const void * GGML_RESTRICT y, size_t by, int nrc);
|
| 2384 |
|