Carolinabanana S S slaren ggerganov committed on
Commit
8cf7097
·
unverified ·
1 Parent(s): 8499e3f

llama : add Command R Plus support (llama/6491)

Browse files

* Add Command R Plus GGUF

* Add Command R Plus GGUF

* Loading works up to LayerNorm2D

* Export new tensors in 1D so they are not quantized.

* Fix embedding layer based on Noeda's example

* Whitespace

* Add line

* Fix unexpected tokens on MPS. Re-add F16 fix. (Noeda)

* dranger003: Fix block index overflow in CUDA dequantizing.

* Reverted blocked multiplication code as it still has issues and could affect other Llama arches

* export norms as f32

* fix overflow issues during quant and other cleanup

* Type convention

Co-authored-by: Georgi Gerganov <[email protected]>

* dranger003: Fix more int overflow during quant.

---------

Co-authored-by: S <[email protected]>
Co-authored-by: S <[email protected]>
Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>

ggml-cuda.cu CHANGED
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1225
 
1226
  // the main device has a larger memory buffer to hold the results from all GPUs
1227
  // ldc == nrows of the matrix that cuBLAS writes into
1228
- int ldc = id == ctx.device ? ne0 : row_diff;
1229
 
1230
  const int compute_capability = ggml_cuda_info().devices[id].cc;
1231
 
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
1377
  const int64_t ne0 = dst->ne[0];
1378
  const int64_t ne1 = dst->ne[1];
1379
 
1380
- const int nb2 = dst->nb[2];
1381
- const int nb3 = dst->nb[3];
1382
 
1383
  GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1384
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
 
1225
 
1226
  // the main device has a larger memory buffer to hold the results from all GPUs
1227
  // ldc == nrows of the matrix that cuBLAS writes into
1228
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;
1229
 
1230
  const int compute_capability = ggml_cuda_info().devices[id].cc;
1231
 
 
1377
  const int64_t ne0 = dst->ne[0];
1378
  const int64_t ne1 = dst->ne[1];
1379
 
1380
+ const int64_t nb2 = dst->nb[2];
1381
+ const int64_t nb3 = dst->nb[3];
1382
 
1383
  GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1384
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
ggml-cuda/common.cuh CHANGED
@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
394
  // TODO: move to ggml-common.h
395
  static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
396
 
397
- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
398
 
399
 
400
  //////////////////////
 
394
  // TODO: move to ggml-common.h
395
  static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
396
 
397
+ typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
398
 
399
 
400
  //////////////////////
ggml-cuda/convert.cu CHANGED
@@ -4,14 +4,14 @@
4
  #define CUDA_Q8_0_NE_ALIGN 2048
5
 
6
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
7
- static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
8
- const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
9
 
10
  if (i >= k) {
11
  return;
12
  }
13
 
14
- const int ib = i/qk; // block index
15
  const int iqs = (i%qk)/qr; // quant index
16
  const int iybs = i - i%qk; // y block start index
17
  const int y_offset = qr == 1 ? 1 : qk/2;
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
25
  }
26
 
27
  template <bool need_check>
28
- static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
29
  #if __CUDA_ARCH__ >= CC_PASCAL
30
  constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
31
 
@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
68
  template<typename dst_t>
69
  static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
70
 
71
- const int i = blockIdx.x;
72
 
73
  // assume 32 threads
74
  const int tid = threadIdx.x;
75
  const int il = tid/8;
76
  const int ir = tid%8;
77
- const int ib = 8*i + ir;
78
  if (ib >= nb32) {
79
  return;
80
  }
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
96
  template<typename dst_t>
97
  static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
98
 
99
- const int i = blockIdx.x;
100
 
101
  // assume 32 threads
102
  const int tid = threadIdx.x;
103
  const int il = tid/8;
104
  const int ir = tid%8;
105
- const int ib = 8*i + ir;
106
  if (ib >= nb32) {
107
  return;
108
  }
@@ -313,14 +313,14 @@ template<typename dst_t>
313
  static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
314
  const block_q6_K * x = (const block_q6_K *) vx;
315
 
316
- const int i = blockIdx.x;
317
  #if QK_K == 256
318
 
319
  // assume 64 threads - this is very slightly better than the one below
320
- const int tid = threadIdx.x;
321
- const int ip = tid/32; // ip is 0 or 1
322
- const int il = tid - 32*ip; // 0...32
323
- const int is = 8*ip + il/16;
324
 
325
  dst_t * y = yy + i*QK_K + 128*ip + il;
326
 
@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
337
  #else
338
 
339
  // assume 32 threads
340
- const int tid = threadIdx.x;
341
- const int ip = tid/16; // 0 or 1
342
- const int il = tid - 16*ip; // 0...15
343
 
344
  dst_t * y = yy + i*QK_K + 16*ip + il;
345
 
@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
571
  #endif
572
 
573
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
574
- static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
575
  const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
576
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
577
  }
578
 
579
- static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
580
  const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
581
  if (k % CUDA_Q8_0_NE_ALIGN == 0) {
582
  const bool need_check = false;
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
588
  }
589
 
590
  template<typename dst_t>
591
- static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
592
  const int nb = k / QK_K;
593
  #if QK_K == 256
594
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
598
  }
599
 
600
  template<typename dst_t>
601
- static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
602
  const int nb = k / QK_K;
603
  #if QK_K == 256
604
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
608
  }
609
 
610
  template<typename dst_t>
611
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
612
  const int nb32 = k / 32;
613
  const int nb = (k + 255) / 256;
614
  dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
615
  }
616
 
617
  template<typename dst_t>
618
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
619
  const int nb32 = k / 32;
620
  const int nb = (k + 255) / 256;
621
  dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
622
  }
623
 
624
  template<typename dst_t>
625
- static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
626
  const int nb = k / QK_K;
627
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
628
  }
629
 
630
  template<typename dst_t>
631
- static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
632
  const int nb = k / QK_K;
633
  #if QK_K == 256
634
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
638
  }
639
 
640
  template<typename dst_t>
641
- static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
642
  const int nb = k / QK_K;
643
  #if QK_K == 256
644
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
648
  }
649
 
650
  template<typename dst_t>
651
- static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
652
  const int nb = k / QK_K;
653
  dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
654
  }
655
 
656
  template<typename dst_t>
657
- static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
658
  const int nb = k / QK_K;
659
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
660
  }
661
 
662
  template<typename dst_t>
663
- static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
664
  const int nb = k / QK_K;
665
  dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
666
  }
667
 
668
  template<typename dst_t>
669
- static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
670
  const int nb = k / QK_K;
671
  dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
672
  }
673
 
674
  template<typename dst_t>
675
- static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
676
  const int nb = k / QK_K;
677
  dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
678
  }
679
 
680
  template<typename dst_t>
681
- static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
682
  const int nb = k / QK_K;
683
  dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
684
  }
685
 
686
  template<typename dst_t>
687
- static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
688
  const int nb = (k + QK_K - 1) / QK_K;
689
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
690
  }
691
 
692
  template<typename dst_t>
693
- static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
694
  const int nb = k / QK_K;
695
  dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
696
  }
697
 
698
  template<typename dst_t>
699
- static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
700
  const int nb = (k + QK_K - 1) / QK_K;
701
  #if QK_K == 64
702
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
706
  }
707
 
708
  template <typename src_t, typename dst_t>
709
- static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
710
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
711
 
712
  if (i >= k) {
713
  return;
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
719
  }
720
 
721
  template <typename src_t, typename dst_t>
722
- static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
723
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
724
  convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
725
  }
 
4
  #define CUDA_Q8_0_NE_ALIGN 2048
5
 
6
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
7
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
8
+ const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
9
 
10
  if (i >= k) {
11
  return;
12
  }
13
 
14
+ const int64_t ib = i/qk; // block index
15
  const int iqs = (i%qk)/qr; // quant index
16
  const int iybs = i - i%qk; // y block start index
17
  const int y_offset = qr == 1 ? 1 : qk/2;
 
25
  }
26
 
27
  template <bool need_check>
28
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
29
  #if __CUDA_ARCH__ >= CC_PASCAL
30
  constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
31
 
 
68
  template<typename dst_t>
69
  static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
70
 
71
+ const int64_t i = blockIdx.x;
72
 
73
  // assume 32 threads
74
  const int tid = threadIdx.x;
75
  const int il = tid/8;
76
  const int ir = tid%8;
77
+ const int64_t ib = 8*i + ir;
78
  if (ib >= nb32) {
79
  return;
80
  }
 
96
  template<typename dst_t>
97
  static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
98
 
99
+ const int64_t i = blockIdx.x;
100
 
101
  // assume 32 threads
102
  const int tid = threadIdx.x;
103
  const int il = tid/8;
104
  const int ir = tid%8;
105
+ const int64_t ib = 8*i + ir;
106
  if (ib >= nb32) {
107
  return;
108
  }
 
313
  static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
314
  const block_q6_K * x = (const block_q6_K *) vx;
315
 
316
+ const int64_t i = blockIdx.x;
317
  #if QK_K == 256
318
 
319
  // assume 64 threads - this is very slightly better than the one below
320
+ const int64_t tid = threadIdx.x;
321
+ const int64_t ip = tid/32; // ip is 0 or 1
322
+ const int64_t il = tid - 32*ip; // 0...32
323
+ const int64_t is = 8*ip + il/16;
324
 
325
  dst_t * y = yy + i*QK_K + 128*ip + il;
326
 
 
337
  #else
338
 
339
  // assume 32 threads
340
+ const int64_t tid = threadIdx.x;
341
+ const int64_t ip = tid/16; // 0 or 1
342
+ const int64_t il = tid - 16*ip; // 0...15
343
 
344
  dst_t * y = yy + i*QK_K + 16*ip + il;
345
 
 
571
  #endif
572
 
573
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
574
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
575
  const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
576
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
577
  }
578
 
579
+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
580
  const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
581
  if (k % CUDA_Q8_0_NE_ALIGN == 0) {
582
  const bool need_check = false;
 
588
  }
589
 
590
  template<typename dst_t>
591
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
592
  const int nb = k / QK_K;
593
  #if QK_K == 256
594
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
 
598
  }
599
 
600
  template<typename dst_t>
601
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
602
  const int nb = k / QK_K;
603
  #if QK_K == 256
604
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
 
608
  }
609
 
610
  template<typename dst_t>
611
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
612
  const int nb32 = k / 32;
613
  const int nb = (k + 255) / 256;
614
  dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
615
  }
616
 
617
  template<typename dst_t>
618
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
619
  const int nb32 = k / 32;
620
  const int nb = (k + 255) / 256;
621
  dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
622
  }
623
 
624
  template<typename dst_t>
625
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
626
  const int nb = k / QK_K;
627
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
628
  }
629
 
630
  template<typename dst_t>
631
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
632
  const int nb = k / QK_K;
633
  #if QK_K == 256
634
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
 
638
  }
639
 
640
  template<typename dst_t>
641
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
642
  const int nb = k / QK_K;
643
  #if QK_K == 256
644
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 
648
  }
649
 
650
  template<typename dst_t>
651
+ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
652
  const int nb = k / QK_K;
653
  dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
654
  }
655
 
656
  template<typename dst_t>
657
+ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
658
  const int nb = k / QK_K;
659
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
660
  }
661
 
662
  template<typename dst_t>
663
+ static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
664
  const int nb = k / QK_K;
665
  dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
666
  }
667
 
668
  template<typename dst_t>
669
+ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
670
  const int nb = k / QK_K;
671
  dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
672
  }
673
 
674
  template<typename dst_t>
675
+ static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
676
  const int nb = k / QK_K;
677
  dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
678
  }
679
 
680
  template<typename dst_t>
681
+ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
682
  const int nb = k / QK_K;
683
  dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
684
  }
685
 
686
  template<typename dst_t>
687
+ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
688
  const int nb = (k + QK_K - 1) / QK_K;
689
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
690
  }
691
 
692
  template<typename dst_t>
693
+ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
694
  const int nb = k / QK_K;
695
  dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
696
  }
697
 
698
  template<typename dst_t>
699
+ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
700
  const int nb = (k + QK_K - 1) / QK_K;
701
  #if QK_K == 64
702
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 
706
  }
707
 
708
  template <typename src_t, typename dst_t>
709
+ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
710
+ const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
711
 
712
  if (i >= k) {
713
  return;
 
719
  }
720
 
721
  template <typename src_t, typename dst_t>
722
+ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
723
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
724
  convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
725
  }
ggml-cuda/convert.cuh CHANGED
@@ -3,7 +3,7 @@
3
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
4
 
5
  template<typename T>
6
- using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
7
 
8
  typedef to_t_cuda_t<float> to_fp32_cuda_t;
9
  typedef to_t_cuda_t<half> to_fp16_cuda_t;
 
3
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
4
 
5
  template<typename T>
6
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
7
 
8
  typedef to_t_cuda_t<float> to_fp32_cuda_t;
9
  typedef to_t_cuda_t<half> to_fp16_cuda_t;
ggml-cuda/dequantize.cuh CHANGED
@@ -1,6 +1,6 @@
1
  #include "common.cuh"
2
 
3
- static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
4
  const block_q4_0 * x = (const block_q4_0 *) vx;
5
 
6
  const dfloat d = x[ib].d;
@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
19
  #endif // GGML_CUDA_F16
20
  }
21
 
22
- static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
23
  const block_q4_1 * x = (const block_q4_1 *) vx;
24
 
25
  const dfloat d = __low2half(x[ib].dm);
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
39
  #endif // GGML_CUDA_F16
40
  }
41
 
42
- static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
43
  const block_q5_0 * x = (const block_q5_0 *) vx;
44
 
45
  const dfloat d = x[ib].d;
@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
62
  #endif // GGML_CUDA_F16
63
  }
64
 
65
- static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
66
  const block_q5_1 * x = (const block_q5_1 *) vx;
67
 
68
  const dfloat d = __low2half(x[ib].dm);
@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
86
  #endif // GGML_CUDA_F16
87
  }
88
 
89
- static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
90
  const block_q8_0 * x = (const block_q8_0 *) vx;
91
 
92
  const dfloat d = x[ib].d;
 
1
  #include "common.cuh"
2
 
3
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
4
  const block_q4_0 * x = (const block_q4_0 *) vx;
5
 
6
  const dfloat d = x[ib].d;
 
19
  #endif // GGML_CUDA_F16
20
  }
21
 
22
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
23
  const block_q4_1 * x = (const block_q4_1 *) vx;
24
 
25
  const dfloat d = __low2half(x[ib].dm);
 
39
  #endif // GGML_CUDA_F16
40
  }
41
 
42
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
43
  const block_q5_0 * x = (const block_q5_0 *) vx;
44
 
45
  const dfloat d = x[ib].d;
 
62
  #endif // GGML_CUDA_F16
63
  }
64
 
65
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
66
  const block_q5_1 * x = (const block_q5_1 *) vx;
67
 
68
  const dfloat d = __low2half(x[ib].dm);
 
86
  #endif // GGML_CUDA_F16
87
  }
88
 
89
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
90
  const block_q8_0 * x = (const block_q8_0 *) vx;
91
 
92
  const dfloat d = x[ib].d;
ggml-cuda/dmmv.cu CHANGED
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
565
  }
566
  }
567
 
568
- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
569
  const half * x = (const half *) vx;
570
 
571
  // automatic half -> float type cast if dfloat == float
@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
577
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
578
  // qk = quantized weights per x block
579
  // qr = number of quantized weights per data value in x block
580
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
581
 
582
  if (row >= nrows) {
583
  return;
@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
598
 
599
  for (int i = 0; i < ncols; i += iter_stride) {
600
  const int col = i + vals_per_iter*tid;
601
- const int ib = (row*ncols + col)/qk; // x block index
602
  const int iqs = (col%qk)/qr; // x quant index
603
  const int iybs = col - col%qk; // y block start index
604
 
 
565
  }
566
  }
567
 
568
+ static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
569
  const half * x = (const half *) vx;
570
 
571
  // automatic half -> float type cast if dfloat == float
 
577
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
578
  // qk = quantized weights per x block
579
  // qr = number of quantized weights per data value in x block
580
+ const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
581
 
582
  if (row >= nrows) {
583
  return;
 
598
 
599
  for (int i = 0; i < ncols; i += iter_stride) {
600
  const int col = i + vals_per_iter*tid;
601
+ const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
602
  const int iqs = (col%qk)/qr; // x quant index
603
  const int iybs = col - col%qk; // y block start index
604
 
ggml-cuda/quantize.cu CHANGED
@@ -1,20 +1,20 @@
1
  #include "quantize.cuh"
2
 
3
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
4
- const int ix = blockDim.x*blockIdx.x + threadIdx.x;
5
 
6
  if (ix >= kx_padded) {
7
  return;
8
  }
9
 
10
- const int iy = blockDim.y*blockIdx.y + threadIdx.y;
11
 
12
- const int i_padded = iy*kx_padded + ix;
13
 
14
  block_q8_1 * y = (block_q8_1 *) vy;
15
 
16
- const int ib = i_padded / QK8_1; // block index
17
- const int iqs = i_padded % QK8_1; // quant index
18
 
19
  const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
20
  float amax = fabsf(xi);
@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
36
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
37
  }
38
 
39
- void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
40
- const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
41
  const dim3 num_blocks(block_num_x, ky, 1);
42
  const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
43
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 
1
  #include "quantize.cuh"
2
 
3
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
4
+ const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
5
 
6
  if (ix >= kx_padded) {
7
  return;
8
  }
9
 
10
+ const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
11
 
12
+ const int64_t i_padded = (int64_t)iy*kx_padded + ix;
13
 
14
  block_q8_1 * y = (block_q8_1 *) vy;
15
 
16
+ const int64_t ib = i_padded / QK8_1; // block index
17
+ const int64_t iqs = i_padded % QK8_1; // quant index
18
 
19
  const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
20
  float amax = fabsf(xi);
 
36
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
37
  }
38
 
39
+ void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
40
+ const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
41
  const dim3 num_blocks(block_num_x, ky, 1);
42
  const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
43
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
ggml-cuda/quantize.cuh CHANGED
@@ -2,4 +2,4 @@
2
 
3
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
4
 
5
- void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
 
2
 
3
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
4
 
5
+ void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
ggml-quants.c CHANGED
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
544
  #endif
545
 
546
  // reference implementation for deterministic creation of model files
547
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
548
  static const int qk = QK4_0;
549
 
550
  assert(k % qk == 0);
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
581
  }
582
  }
583
 
584
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
585
  quantize_row_q4_0_reference(x, y, k);
586
  }
587
 
588
 
589
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
590
  const int qk = QK4_1;
591
 
592
  assert(k % qk == 0);
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
623
  }
624
  }
625
 
626
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
627
  quantize_row_q4_1_reference(x, y, k);
628
  }
629
 
630
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
631
  static const int qk = QK5_0;
632
 
633
  assert(k % qk == 0);
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
671
  }
672
  }
673
 
674
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
675
  quantize_row_q5_0_reference(x, y, k);
676
  }
677
 
678
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
679
  const int qk = QK5_1;
680
 
681
  assert(k % qk == 0);
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
719
  }
720
  }
721
 
722
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
723
  quantize_row_q5_1_reference(x, y, k);
724
  }
725
 
726
  // reference implementation for deterministic creation of model files
727
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
728
  assert(k % QK8_0 == 0);
729
  const int nb = k / QK8_0;
730
 
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
749
  }
750
  }
751
 
752
- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
753
  assert(QK8_0 == 32);
754
  assert(k % QK8_0 == 0);
755
  const int nb = k / QK8_0;
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
938
  }
939
 
940
  // reference implementation for deterministic creation of model files
941
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
942
  assert(QK8_1 == 32);
943
  assert(k % QK8_1 == 0);
944
  const int nb = k / QK8_1;
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
973
  }
974
  }
975
 
976
- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
977
  assert(k % QK8_1 == 0);
978
  const int nb = k / QK8_1;
979
 
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1192
  #endif
1193
  }
1194
 
1195
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
1196
  static const int qk = QK4_0;
1197
 
1198
  assert(k % qk == 0);
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
1212
  }
1213
  }
1214
 
1215
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
1216
  static const int qk = QK4_1;
1217
 
1218
  assert(k % qk == 0);
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
1233
  }
1234
  }
1235
 
1236
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
1237
  static const int qk = QK5_0;
1238
 
1239
  assert(k % qk == 0);
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
1259
  }
1260
  }
1261
 
1262
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
1263
  static const int qk = QK5_1;
1264
 
1265
  assert(k % qk == 0);
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
1286
  }
1287
  }
1288
 
1289
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
1290
  static const int qk = QK8_0;
1291
 
1292
  assert(k % qk == 0);
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
1581
 
1582
  //========================- 2-bit (de)-quantization
1583
 
1584
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
1585
  assert(k % QK_K == 0);
1586
  const int nb = k / QK_K;
1587
 
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
1658
  }
1659
  }
1660
 
1661
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
1662
  assert(k % QK_K == 0);
1663
  const int nb = k / QK_K;
1664
 
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
1704
  }
1705
  }
1706
 
1707
- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1708
  quantize_row_q2_K_reference(x, vy, k);
1709
  }
1710
 
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1960
  }
1961
  }
1962
 
1963
- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1965
  if (!quant_weights) {
1966
- quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1967
  }
1968
  else {
1969
  char * qrow = (char *)dst;
1970
- for (int row = 0; row < nrow; ++row) {
1971
  quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1972
  src += n_per_row;
1973
  qrow += row_size;
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
1978
 
1979
  //========================= 3-bit (de)-quantization
1980
 
1981
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
1982
  assert(k % QK_K == 0);
1983
  const int nb = k / QK_K;
1984
 
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
2092
  }
2093
 
2094
  #if QK_K == 256
2095
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2096
  assert(k % QK_K == 0);
2097
  const int nb = k / QK_K;
2098
 
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2142
  }
2143
  }
2144
  #else
2145
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2146
  assert(k % QK_K == 0);
2147
  assert(QK_K == 64);
2148
  const int nb = k / QK_K;
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2175
  }
2176
  #endif
2177
 
2178
- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2179
  quantize_row_q3_K_reference(x, vy, k);
2180
  }
2181
 
2182
- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2183
  #if QK_K != 256
2184
  (void)quant_weights;
2185
  quantize_row_q3_K_reference(x, y, n_per_row);
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2268
  #endif
2269
  }
2270
 
2271
- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2273
  if (!quant_weights) {
2274
- quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2275
  }
2276
  else {
2277
  char * qrow = (char *)dst;
2278
- for (int row = 0; row < nrow; ++row) {
2279
  quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2280
  src += n_per_row;
2281
  qrow += row_size;
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
2286
 
2287
  // ====================== 4-bit (de)-quantization
2288
 
2289
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
2290
  assert(k % QK_K == 0);
2291
  const int nb = k / QK_K;
2292
 
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
2393
  }
2394
  }
2395
 
2396
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
2397
  assert(k % QK_K == 0);
2398
  const int nb = k / QK_K;
2399
 
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
2432
  }
2433
  }
2434
 
2435
- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2436
  assert(k % QK_K == 0);
2437
  block_q4_K * restrict y = vy;
2438
  quantize_row_q4_K_reference(x, y, k);
2439
  }
2440
 
2441
- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2442
  #if QK_K != 256
2443
  (void)quant_weights;
2444
  quantize_row_q4_K_reference(x, y, n_per_row);
2445
  #else
2446
  assert(n_per_row % QK_K == 0);
2447
- const int nb = n_per_row / QK_K;
2448
 
2449
  uint8_t L[QK_K];
2450
  uint8_t Laux[32];
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2516
  #endif
2517
  }
2518
 
2519
- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2521
  if (!quant_weights) {
2522
- quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2523
  }
2524
  else {
2525
  char * qrow = (char *)dst;
2526
- for (int row = 0; row < nrow; ++row) {
2527
  quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2528
  src += n_per_row;
2529
  qrow += row_size;
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
2534
 
2535
  // ====================== 5-bit (de)-quantization
2536
 
2537
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
2538
  assert(k % QK_K == 0);
2539
- const int nb = k / QK_K;
2540
 
2541
  #if QK_K == 256
2542
  uint8_t L[QK_K];
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2676
  }
2677
  }
2678
 
2679
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
2680
  assert(k % QK_K == 0);
2681
- const int nb = k / QK_K;
2682
 
2683
  for (int i = 0; i < nb; i++) {
2684
 
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
2721
  }
2722
  }
2723
 
2724
- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2725
  assert(k % QK_K == 0);
2726
  block_q5_K * restrict y = vy;
2727
  quantize_row_q5_K_reference(x, y, k);
2728
  }
2729
 
2730
- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2731
  #if QK_K != 256
2732
  (void)quant_weights;
2733
  quantize_row_q5_K_reference(x, y, n_per_row);
2734
  #else
2735
  assert(n_per_row % QK_K == 0);
2736
- const int nb = n_per_row / QK_K;
2737
 
2738
  uint8_t L[QK_K];
2739
  uint8_t Laux[32];
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2825
  #endif
2826
  }
2827
 
2828
- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2830
  if (!quant_weights) {
2831
- quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2832
  }
2833
  else {
2834
  char * qrow = (char *)dst;
2835
- for (int row = 0; row < nrow; ++row) {
2836
  quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2837
  src += n_per_row;
2838
  qrow += row_size;
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
2843
 
2844
  // ====================== 6-bit (de)-quantization
2845
 
2846
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
2847
  assert(k % QK_K == 0);
2848
- const int nb = k / QK_K;
2849
 
2850
  int8_t L[QK_K];
2851
  float scales[QK_K/16];
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2925
  }
2926
  }
2927
 
2928
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2929
  assert(k % QK_K == 0);
2930
- const int nb = k / QK_K;
2931
 
2932
  for (int i = 0; i < nb; i++) {
2933
 
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
2972
  }
2973
  }
2974
 
2975
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2976
  assert(k % QK_K == 0);
2977
  block_q6_K * restrict y = vy;
2978
  quantize_row_q6_K_reference(x, y, k);
2979
  }
2980
 
2981
- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2982
  #if QK_K != 256
2983
  (void)quant_weights;
2984
  quantize_row_q6_K_reference(x, y, n_per_row);
2985
  #else
2986
  assert(n_per_row % QK_K == 0);
2987
- const int nb = n_per_row / QK_K;
2988
 
2989
  int8_t L[QK_K];
2990
  float scales[QK_K/16];
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3067
  #endif
3068
  }
3069
 
3070
- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3072
  if (!quant_weights) {
3073
- quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3074
  }
3075
  else {
3076
  char * qrow = (char *)dst;
3077
- for (int row = 0; row < nrow; ++row) {
3078
  quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3079
  src += n_per_row;
3080
  qrow += row_size;
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
3083
  return nrow * row_size;
3084
  }
3085
 
3086
- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
3087
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3088
 
3089
  if (!quant_weights) {
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3098
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3099
  float sigma2 = sum_x2/n_per_row;
3100
 
3101
- const int nb = n_per_row/QK4_0;
3102
  for (int ib = 0; ib < nb; ++ib) {
3103
  const float * xb = x + QK4_0 * ib;
3104
  const float * qw = quant_weights + QK4_0 * ib;
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3111
  }
3112
  }
3113
 
3114
- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3115
  if (!quant_weights) {
3116
- quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3117
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3118
  }
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3120
  char * qrow = (char *)dst;
3121
- for (int row = 0; row < nrow; ++row) {
3122
  quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3123
  src += n_per_row;
3124
  qrow += row_size;
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
3126
  return nrow * row_size;
3127
  }
3128
 
3129
- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
3130
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3131
 
3132
  if (!quant_weights) {
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3141
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
  float sigma2 = sum_x2/n_per_row;
3143
 
3144
- const int nb = n_per_row/QK4_1;
3145
  for (int ib = 0; ib < nb; ++ib) {
3146
  const float * xb = x + QK4_1 * ib;
3147
  const float * qw = quant_weights + QK4_1 * ib;
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3156
  }
3157
  }
3158
 
3159
- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3160
  if (!quant_weights) {
3161
- quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3162
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3163
  }
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3165
  char * qrow = (char *)dst;
3166
- for (int row = 0; row < nrow; ++row) {
3167
  quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3168
  src += n_per_row;
3169
  qrow += row_size;
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
3171
  return nrow * row_size;
3172
  }
3173
 
3174
- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
3175
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3176
 
3177
  if (!quant_weights) {
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3186
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3187
  float sigma2 = sum_x2/n_per_row;
3188
 
3189
- const int nb = n_per_row/QK5_0;
3190
  for (int ib = 0; ib < nb; ++ib) {
3191
  const float * xb = x + QK5_0 * ib;
3192
  const float * qw = quant_weights + QK5_0 * ib;
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3210
  }
3211
  }
3212
 
3213
- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3214
  if (!quant_weights) {
3215
- quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3216
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3217
  }
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3219
  char * qrow = (char *)dst;
3220
- for (int row = 0; row < nrow; ++row) {
3221
  quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3222
  src += n_per_row;
3223
  qrow += row_size;
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
3225
  return nrow * row_size;
3226
  }
3227
 
3228
- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3229
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3230
 
3231
  if (!quant_weights) {
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3240
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3241
  float sigma2 = sum_x2/n_per_row;
3242
 
3243
- const int nb = n_per_row/QK5_1;
3244
  for (int ib = 0; ib < nb; ++ib) {
3245
  const float * xb = x + QK5_1 * ib;
3246
  const float * qw = quant_weights + QK5_1 * ib;
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3263
  }
3264
  }
3265
 
3266
- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3267
  if (!quant_weights) {
3268
- quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3269
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3270
  }
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3272
  char * qrow = (char *)dst;
3273
- for (int row = 0; row < nrow; ++row) {
3274
  quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3275
  src += n_per_row;
3276
  qrow += row_size;
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
3278
  return nrow * row_size;
3279
  }
3280
 
3281
- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3282
  (void)quant_weights; // not used
3283
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
- quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3285
  return nrow * row_size;
3286
  }
3287
 
3288
  // ====================== "True" 2-bit (de)-quantization
3289
 
3290
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
3291
  assert(k % QK_K == 0);
3292
- const int nb = k / QK_K;
3293
 
3294
  uint32_t aux32[2];
3295
  const uint8_t * aux8 = (const uint8_t *)aux32;
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
3315
 
3316
  // ====================== 2.3125 bpw (de)-quantization
3317
 
3318
- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
3319
  assert(k % QK_K == 0);
3320
- const int nb = k / QK_K;
3321
 
3322
  float db[2];
3323
 
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3342
 
3343
  // ====================== 2.5625 bpw (de)-quantization
3344
 
3345
- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
3346
  assert(k % QK_K == 0);
3347
- const int nb = k / QK_K;
3348
 
3349
  float db[2];
3350
 
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
3374
 
3375
  // ====================== 3.0625 bpw (de)-quantization
3376
 
3377
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
3378
  assert(k % QK_K == 0);
3379
- const int nb = k / QK_K;
3380
 
3381
  uint32_t aux32;
3382
 
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
3406
 
3407
  // ====================== 3.3125 bpw (de)-quantization
3408
 
3409
- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
3410
  assert(k % QK_K == 0);
3411
- const int nb = k / QK_K;
3412
 
3413
  for (int i = 0; i < nb; i++) {
3414
 
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
3449
 
3450
  // ====================== 1.5625 bpw (de)-quantization
3451
 
3452
- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
3453
  assert(k % QK_K == 0);
3454
- const int nb = k / QK_K;
3455
 
3456
  for (int i = 0; i < nb; i++) {
3457
 
@@ -3474,9 +3474,9 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
3474
  }
3475
  }
3476
 
3477
- void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
3478
  assert(k % QK_K == 0);
3479
- const int nb = k / QK_K;
3480
 
3481
  float delta[4];
3482
  uint16_t idx[4];
@@ -3535,9 +3535,9 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
3535
 
3536
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3537
 
3538
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
3539
  assert(k % QK4_NL == 0);
3540
- const int nb = k / QK4_NL;
3541
 
3542
  for (int i = 0; i < nb; i++) {
3543
 
@@ -3553,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
3553
  }
3554
  }
3555
 
3556
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
3557
  assert(k % QK_K == 0);
3558
  #if QK_K == 64
3559
  dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
3560
  #else
3561
- const int nb = k / QK_K;
3562
 
3563
  for (int i = 0; i < nb; i++) {
3564
 
@@ -3582,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
3582
 
3583
  //===================================== Q8_K ==============================================
3584
 
3585
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
3586
  assert(k % QK_K == 0);
3587
- const int nb = k / QK_K;
3588
 
3589
  for (int i = 0; i < nb; i++) {
3590
 
@@ -3621,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
3621
  }
3622
  }
3623
 
3624
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
3625
  assert(k % QK_K == 0);
3626
- const int nb = k / QK_K;
3627
 
3628
  for (int i = 0; i < nb; i++) {
3629
  for (int j = 0; j < QK_K; ++j) {
@@ -3632,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
3632
  }
3633
  }
3634
 
3635
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
3636
  quantize_row_q8_K_reference(x, y, k);
3637
  }
3638
 
@@ -10648,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
10648
  return grid_index;
10649
  }
10650
 
10651
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10652
 
10653
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
10654
 
@@ -10664,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10664
 
10665
  const int kMaxQ = 3;
10666
 
10667
- const int nbl = n/QK_K;
10668
 
10669
  block_iq2_xxs * y = vy;
10670
 
@@ -10821,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10821
  }
10822
  }
10823
 
10824
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10825
 
10826
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
10827
 
@@ -10837,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10837
 
10838
  const int kMaxQ = 3;
10839
 
10840
- const int nbl = n/QK_K;
10841
 
10842
  block_iq2_xs * y = vy;
10843
 
@@ -11001,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
11001
  }
11002
  }
11003
 
11004
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11005
  GGML_ASSERT(n_per_row%QK_K == 0);
11006
- int nblock = n_per_row/QK_K;
11007
  char * qrow = (char *)dst;
11008
- for (int row = 0; row < nrow; ++row) {
11009
  quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
11010
  src += n_per_row;
11011
  qrow += nblock*sizeof(block_iq2_xxs);
@@ -11013,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
11013
  return nrow * nblock * sizeof(block_iq2_xxs);
11014
  }
11015
 
11016
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11017
  GGML_ASSERT(n_per_row%QK_K == 0);
11018
- int nblock = n_per_row/QK_K;
11019
  char * qrow = (char *)dst;
11020
- for (int row = 0; row < nrow; ++row) {
11021
  quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
11022
  src += n_per_row;
11023
  qrow += nblock*sizeof(block_iq2_xs);
@@ -11242,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
11242
  return grid_index;
11243
  }
11244
 
11245
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11246
  const float * restrict quant_weights) {
11247
 
11248
  const int gindex = iq3_data_index(grid_size);
@@ -11259,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11259
 
11260
  const int kMaxQ = 8;
11261
 
11262
- const int nbl = n/QK_K;
11263
 
11264
  ggml_fp16_t * dh;
11265
  uint8_t * qs;
@@ -11455,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11455
  }
11456
  }
11457
 
11458
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11459
  GGML_ASSERT(n_per_row%QK_K == 0);
11460
- int nblock = n_per_row/QK_K;
11461
  char * qrow = (char *)dst;
11462
- for (int row = 0; row < nrow; ++row) {
11463
  quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11464
  src += n_per_row;
11465
  qrow += nblock*sizeof(block_iq3_xxs);
@@ -11467,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
11467
  return nrow * nblock * sizeof(block_iq3_xxs);
11468
  }
11469
 
11470
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11471
  assert(k % QK_K == 0);
11472
  block_iq3_xxs * restrict y = vy;
11473
  quantize_row_iq3_xxs_reference(x, y, k);
11474
  }
11475
 
11476
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11477
  assert(k % QK_K == 0);
11478
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11479
  }
@@ -11504,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11504
 
11505
  const int kMaxQ = 8;
11506
 
11507
- const int nbl = n/QK_K;
11508
 
11509
  block_iq3_s * y = vy;
11510
 
@@ -11661,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11661
  }
11662
 
11663
  #define IQ3S_BLOCK_SIZE 32
11664
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11665
  GGML_ASSERT(n_per_row%QK_K == 0);
11666
- int nblock = n_per_row/QK_K;
11667
  float scales[QK_K/IQ3S_BLOCK_SIZE];
11668
  float weight[IQ3S_BLOCK_SIZE];
11669
  float xval[IQ3S_BLOCK_SIZE];
@@ -11674,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11674
  bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
11675
  uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
11676
  char * qrow = (char *)dst;
11677
- for (int row = 0; row < nrow; ++row) {
11678
  quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
11679
  scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
11680
  src += n_per_row;
@@ -11683,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11683
  return nrow * nblock * sizeof(block_iq3_s);
11684
  }
11685
 
11686
- void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
11687
  assert(k % QK_K == 0);
11688
  block_iq3_s * restrict y = vy;
11689
  quantize_row_iq3_s_reference(x, y, k);
11690
  }
11691
 
11692
- void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11693
  assert(k % QK_K == 0);
11694
  quantize_iq3_s(x, y, 1, k, NULL);
11695
  }
@@ -11822,7 +11822,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
11822
 
11823
  #define IQ1S_BLOCK_SIZE 32
11824
  #define IQ1M_BLOCK_SIZE 16
11825
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
11826
  float * scales,
11827
  float * weight,
11828
  float * sumx,
@@ -11846,7 +11846,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11846
 
11847
  block_iq1_s * y = vy;
11848
 
11849
- const int nbl = n/QK_K;
11850
 
11851
  const int block_size = IQ1S_BLOCK_SIZE;
11852
 
@@ -11980,7 +11980,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11980
  }
11981
  }
11982
 
11983
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11984
  GGML_ASSERT(n_per_row%QK_K == 0);
11985
  float scales[QK_K/IQ1S_BLOCK_SIZE];
11986
  float weight[IQ1S_BLOCK_SIZE];
@@ -11990,9 +11990,9 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
11990
  float pairs[2*IQ1S_BLOCK_SIZE];
11991
  uint16_t index[IQ1S_BLOCK_SIZE/8];
11992
  int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11993
- int nblock = n_per_row/QK_K;
11994
  char * qrow = (char *)dst;
11995
- for (int row = 0; row < nrow; ++row) {
11996
  quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
11997
  src += n_per_row;
11998
  qrow += nblock*sizeof(block_iq1_s);
@@ -12000,7 +12000,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
12000
  return nrow * nblock * sizeof(block_iq1_s);
12001
  }
12002
 
12003
- static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
12004
  float * scales,
12005
  float * weight,
12006
  float * pairs,
@@ -12022,7 +12022,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12022
 
12023
  block_iq1_m * y = vy;
12024
 
12025
- const int nbl = n/QK_K;
12026
 
12027
  const int block_size = IQ1M_BLOCK_SIZE;
12028
 
@@ -12265,7 +12265,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12265
  }
12266
  }
12267
 
12268
- size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12269
  GGML_ASSERT(n_per_row%QK_K == 0);
12270
  float scales[QK_K/IQ1M_BLOCK_SIZE];
12271
  float weight[IQ1M_BLOCK_SIZE];
@@ -12273,9 +12273,9 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow,
12273
  float pairs[2*IQ1M_BLOCK_SIZE];
12274
  uint16_t index[IQ1M_BLOCK_SIZE/8];
12275
  int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
12276
- int nblock = n_per_row/QK_K;
12277
  char * qrow = (char *)dst;
12278
- for (int row = 0; row < nrow; ++row) {
12279
  quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
12280
  src += n_per_row;
12281
  qrow += nblock*sizeof(block_iq1_m);
@@ -12407,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
12407
  }
12408
  }
12409
 
12410
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12411
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12412
- int nblock = n_per_row/QK4_NL;
12413
  char * qrow = (char *)dst;
12414
  uint8_t L[QK4_NL];
12415
  float weight[QK4_NL];
12416
  uint16_t unused_h;
12417
  uint8_t * unused_l = NULL;
12418
  float scale;
12419
- for (int row = 0; row < nrow; ++row) {
12420
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
12421
  for (int ibl = 0; ibl < nblock; ++ibl) {
12422
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
@@ -12429,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
12429
  return nrow * nblock * sizeof(block_iq4_nl);
12430
  }
12431
 
12432
- void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12433
  GGML_ASSERT(k%QK4_NL == 0);
12434
- int nblock = k/QK4_NL;
12435
  uint8_t L[QK4_NL];
12436
  float weight[QK4_NL];
12437
  uint16_t unused_h;
@@ -12444,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12444
  }
12445
  }
12446
 
12447
- void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12448
  assert(k % QK4_NL == 0);
12449
  quantize_row_iq4_nl(x, y, k);
12450
  }
12451
 
12452
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12453
  #if QK_K == 64
12454
  return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12455
  #else
12456
  GGML_ASSERT(n_per_row%QK_K == 0);
12457
- int nblock = n_per_row/QK_K;
12458
  char * qrow = (char *)dst;
12459
  uint8_t L[QK_K];
12460
  float weight[32];
12461
  float scales[QK_K/32];
12462
- for (int row = 0; row < nrow; ++row) {
12463
  block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12464
  for (int ibl = 0; ibl < nblock; ++ibl) {
12465
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
@@ -12473,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
12473
  #endif
12474
  }
12475
 
12476
- void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12477
  assert(k % QK_K == 0);
12478
  block_iq4_xs * restrict y = vy;
12479
  quantize_row_iq4_xs_reference(x, y, k);
12480
  }
12481
 
12482
- void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12483
  assert(k % QK_K == 0);
12484
  quantize_iq4_xs(x, y, 1, k, NULL);
12485
  }
12486
 
12487
  // =============================== 2.5625 bpw
12488
 
12489
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12490
 
12491
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12492
 
@@ -12501,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12501
 
12502
  const int kMaxQ = 3;
12503
 
12504
- const int nbl = n/QK_K;
12505
 
12506
  block_iq2_s * y = vy;
12507
 
@@ -12654,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12654
  }
12655
  }
12656
 
12657
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12658
  GGML_ASSERT(n_per_row%QK_K == 0);
12659
- int nblock = n_per_row/QK_K;
12660
  char * qrow = (char *)dst;
12661
- for (int row = 0; row < nrow; ++row) {
12662
  quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12663
  src += n_per_row;
12664
  qrow += nblock*sizeof(block_iq2_s);
@@ -12666,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
12666
  return nrow * nblock * sizeof(block_iq2_s);
12667
  }
12668
 
12669
- void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12670
  assert(k % QK_K == 0);
12671
  quantize_iq2_s(x, y, 1, k, NULL);
12672
  }
12673
 
12674
- void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12675
  assert(k % QK_K == 0);
12676
  block_iq2_s * restrict y = vy;
12677
  quantize_row_iq2_s_reference(x, y, k);
 
544
  #endif
545
 
546
  // reference implementation for deterministic creation of model files
547
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
548
  static const int qk = QK4_0;
549
 
550
  assert(k % qk == 0);
 
581
  }
582
  }
583
 
584
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
585
  quantize_row_q4_0_reference(x, y, k);
586
  }
587
 
588
 
589
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
590
  const int qk = QK4_1;
591
 
592
  assert(k % qk == 0);
 
623
  }
624
  }
625
 
626
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
627
  quantize_row_q4_1_reference(x, y, k);
628
  }
629
 
630
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
631
  static const int qk = QK5_0;
632
 
633
  assert(k % qk == 0);
 
671
  }
672
  }
673
 
674
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
675
  quantize_row_q5_0_reference(x, y, k);
676
  }
677
 
678
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
679
  const int qk = QK5_1;
680
 
681
  assert(k % qk == 0);
 
719
  }
720
  }
721
 
722
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
723
  quantize_row_q5_1_reference(x, y, k);
724
  }
725
 
726
  // reference implementation for deterministic creation of model files
727
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
728
  assert(k % QK8_0 == 0);
729
  const int nb = k / QK8_0;
730
 
 
749
  }
750
  }
751
 
752
+ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
753
  assert(QK8_0 == 32);
754
  assert(k % QK8_0 == 0);
755
  const int nb = k / QK8_0;
 
938
  }
939
 
940
  // reference implementation for deterministic creation of model files
941
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
942
  assert(QK8_1 == 32);
943
  assert(k % QK8_1 == 0);
944
  const int nb = k / QK8_1;
 
973
  }
974
  }
975
 
976
+ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
977
  assert(k % QK8_1 == 0);
978
  const int nb = k / QK8_1;
979
 
 
1192
  #endif
1193
  }
1194
 
1195
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
1196
  static const int qk = QK4_0;
1197
 
1198
  assert(k % qk == 0);
 
1212
  }
1213
  }
1214
 
1215
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
1216
  static const int qk = QK4_1;
1217
 
1218
  assert(k % qk == 0);
 
1233
  }
1234
  }
1235
 
1236
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
1237
  static const int qk = QK5_0;
1238
 
1239
  assert(k % qk == 0);
 
1259
  }
1260
  }
1261
 
1262
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
1263
  static const int qk = QK5_1;
1264
 
1265
  assert(k % qk == 0);
 
1286
  }
1287
  }
1288
 
1289
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
1290
  static const int qk = QK8_0;
1291
 
1292
  assert(k % qk == 0);
 
1581
 
1582
  //========================- 2-bit (de)-quantization
1583
 
1584
+ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
1585
  assert(k % QK_K == 0);
1586
  const int nb = k / QK_K;
1587
 
 
1658
  }
1659
  }
1660
 
1661
+ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
1662
  assert(k % QK_K == 0);
1663
  const int nb = k / QK_K;
1664
 
 
1704
  }
1705
  }
1706
 
1707
+ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
1708
  quantize_row_q2_K_reference(x, vy, k);
1709
  }
1710
 
 
1960
  }
1961
  }
1962
 
1963
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1965
  if (!quant_weights) {
1966
+ quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
1967
  }
1968
  else {
1969
  char * qrow = (char *)dst;
1970
+ for (int64_t row = 0; row < nrow; ++row) {
1971
  quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1972
  src += n_per_row;
1973
  qrow += row_size;
 
1978
 
1979
  //========================= 3-bit (de)-quantization
1980
 
1981
+ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
1982
  assert(k % QK_K == 0);
1983
  const int nb = k / QK_K;
1984
 
 
2092
  }
2093
 
2094
  #if QK_K == 256
2095
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2096
  assert(k % QK_K == 0);
2097
  const int nb = k / QK_K;
2098
 
 
2142
  }
2143
  }
2144
  #else
2145
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2146
  assert(k % QK_K == 0);
2147
  assert(QK_K == 64);
2148
  const int nb = k / QK_K;
 
2175
  }
2176
  #endif
2177
 
2178
+ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
2179
  quantize_row_q3_K_reference(x, vy, k);
2180
  }
2181
 
2182
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
2183
  #if QK_K != 256
2184
  (void)quant_weights;
2185
  quantize_row_q3_K_reference(x, y, n_per_row);
 
2268
  #endif
2269
  }
2270
 
2271
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2273
  if (!quant_weights) {
2274
+ quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
2275
  }
2276
  else {
2277
  char * qrow = (char *)dst;
2278
+ for (int64_t row = 0; row < nrow; ++row) {
2279
  quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2280
  src += n_per_row;
2281
  qrow += row_size;
 
2286
 
2287
  // ====================== 4-bit (de)-quantization
2288
 
2289
+ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
2290
  assert(k % QK_K == 0);
2291
  const int nb = k / QK_K;
2292
 
 
2393
  }
2394
  }
2395
 
2396
+ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
2397
  assert(k % QK_K == 0);
2398
  const int nb = k / QK_K;
2399
 
 
2432
  }
2433
  }
2434
 
2435
+ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
2436
  assert(k % QK_K == 0);
2437
  block_q4_K * restrict y = vy;
2438
  quantize_row_q4_K_reference(x, y, k);
2439
  }
2440
 
2441
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2442
  #if QK_K != 256
2443
  (void)quant_weights;
2444
  quantize_row_q4_K_reference(x, y, n_per_row);
2445
  #else
2446
  assert(n_per_row % QK_K == 0);
2447
+ const int64_t nb = n_per_row / QK_K;
2448
 
2449
  uint8_t L[QK_K];
2450
  uint8_t Laux[32];
 
2516
  #endif
2517
  }
2518
 
2519
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2521
  if (!quant_weights) {
2522
+ quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
2523
  }
2524
  else {
2525
  char * qrow = (char *)dst;
2526
+ for (int64_t row = 0; row < nrow; ++row) {
2527
  quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2528
  src += n_per_row;
2529
  qrow += row_size;
 
2534
 
2535
  // ====================== 5-bit (de)-quantization
2536
 
2537
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
2538
  assert(k % QK_K == 0);
2539
+ const int64_t nb = k / QK_K;
2540
 
2541
  #if QK_K == 256
2542
  uint8_t L[QK_K];
 
2676
  }
2677
  }
2678
 
2679
+ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
2680
  assert(k % QK_K == 0);
2681
+ const int64_t nb = k / QK_K;
2682
 
2683
  for (int i = 0; i < nb; i++) {
2684
 
 
2721
  }
2722
  }
2723
 
2724
+ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
2725
  assert(k % QK_K == 0);
2726
  block_q5_K * restrict y = vy;
2727
  quantize_row_q5_K_reference(x, y, k);
2728
  }
2729
 
2730
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2731
  #if QK_K != 256
2732
  (void)quant_weights;
2733
  quantize_row_q5_K_reference(x, y, n_per_row);
2734
  #else
2735
  assert(n_per_row % QK_K == 0);
2736
+ const int64_t nb = n_per_row / QK_K;
2737
 
2738
  uint8_t L[QK_K];
2739
  uint8_t Laux[32];
 
2825
  #endif
2826
  }
2827
 
2828
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2830
  if (!quant_weights) {
2831
+ quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
2832
  }
2833
  else {
2834
  char * qrow = (char *)dst;
2835
+ for (int64_t row = 0; row < nrow; ++row) {
2836
  quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2837
  src += n_per_row;
2838
  qrow += row_size;
 
2843
 
2844
  // ====================== 6-bit (de)-quantization
2845
 
2846
+ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
2847
  assert(k % QK_K == 0);
2848
+ const int64_t nb = k / QK_K;
2849
 
2850
  int8_t L[QK_K];
2851
  float scales[QK_K/16];
 
2925
  }
2926
  }
2927
 
2928
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
2929
  assert(k % QK_K == 0);
2930
+ const int64_t nb = k / QK_K;
2931
 
2932
  for (int i = 0; i < nb; i++) {
2933
 
 
2972
  }
2973
  }
2974
 
2975
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
2976
  assert(k % QK_K == 0);
2977
  block_q6_K * restrict y = vy;
2978
  quantize_row_q6_K_reference(x, y, k);
2979
  }
2980
 
2981
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2982
  #if QK_K != 256
2983
  (void)quant_weights;
2984
  quantize_row_q6_K_reference(x, y, n_per_row);
2985
  #else
2986
  assert(n_per_row % QK_K == 0);
2987
+ const int64_t nb = n_per_row / QK_K;
2988
 
2989
  int8_t L[QK_K];
2990
  float scales[QK_K/16];
 
3067
  #endif
3068
  }
3069
 
3070
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3072
  if (!quant_weights) {
3073
+ quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
3074
  }
3075
  else {
3076
  char * qrow = (char *)dst;
3077
+ for (int64_t row = 0; row < nrow; ++row) {
3078
  quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3079
  src += n_per_row;
3080
  qrow += row_size;
 
3083
  return nrow * row_size;
3084
  }
3085
 
3086
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3087
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3088
 
3089
  if (!quant_weights) {
 
3098
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3099
  float sigma2 = sum_x2/n_per_row;
3100
 
3101
+ const int64_t nb = n_per_row/QK4_0;
3102
  for (int ib = 0; ib < nb; ++ib) {
3103
  const float * xb = x + QK4_0 * ib;
3104
  const float * qw = quant_weights + QK4_0 * ib;
 
3111
  }
3112
  }
3113
 
3114
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3115
  if (!quant_weights) {
3116
+ quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
3117
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3118
  }
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3120
  char * qrow = (char *)dst;
3121
+ for (int64_t row = 0; row < nrow; ++row) {
3122
  quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3123
  src += n_per_row;
3124
  qrow += row_size;
 
3126
  return nrow * row_size;
3127
  }
3128
 
3129
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3130
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3131
 
3132
  if (!quant_weights) {
 
3141
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
  float sigma2 = sum_x2/n_per_row;
3143
 
3144
+ const int64_t nb = n_per_row/QK4_1;
3145
  for (int ib = 0; ib < nb; ++ib) {
3146
  const float * xb = x + QK4_1 * ib;
3147
  const float * qw = quant_weights + QK4_1 * ib;
 
3156
  }
3157
  }
3158
 
3159
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3160
  if (!quant_weights) {
3161
+ quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
3162
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3163
  }
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3165
  char * qrow = (char *)dst;
3166
+ for (int64_t row = 0; row < nrow; ++row) {
3167
  quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3168
  src += n_per_row;
3169
  qrow += row_size;
 
3171
  return nrow * row_size;
3172
  }
3173
 
3174
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3175
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3176
 
3177
  if (!quant_weights) {
 
3186
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3187
  float sigma2 = sum_x2/n_per_row;
3188
 
3189
+ const int64_t nb = n_per_row/QK5_0;
3190
  for (int ib = 0; ib < nb; ++ib) {
3191
  const float * xb = x + QK5_0 * ib;
3192
  const float * qw = quant_weights + QK5_0 * ib;
 
3210
  }
3211
  }
3212
 
3213
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3214
  if (!quant_weights) {
3215
+ quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
3216
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3217
  }
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3219
  char * qrow = (char *)dst;
3220
+ for (int64_t row = 0; row < nrow; ++row) {
3221
  quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3222
  src += n_per_row;
3223
  qrow += row_size;
 
3225
  return nrow * row_size;
3226
  }
3227
 
3228
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3229
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3230
 
3231
  if (!quant_weights) {
 
3240
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3241
  float sigma2 = sum_x2/n_per_row;
3242
 
3243
+ const int64_t nb = n_per_row/QK5_1;
3244
  for (int ib = 0; ib < nb; ++ib) {
3245
  const float * xb = x + QK5_1 * ib;
3246
  const float * qw = quant_weights + QK5_1 * ib;
 
3263
  }
3264
  }
3265
 
3266
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3267
  if (!quant_weights) {
3268
+ quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
3269
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3270
  }
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3272
  char * qrow = (char *)dst;
3273
+ for (int64_t row = 0; row < nrow; ++row) {
3274
  quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3275
  src += n_per_row;
3276
  qrow += row_size;
 
3278
  return nrow * row_size;
3279
  }
3280
 
3281
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3282
  (void)quant_weights; // not used
3283
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
+ quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
3285
  return nrow * row_size;
3286
  }
3287
 
3288
  // ====================== "True" 2-bit (de)-quantization
3289
 
3290
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
3291
  assert(k % QK_K == 0);
3292
+ const int64_t nb = k / QK_K;
3293
 
3294
  uint32_t aux32[2];
3295
  const uint8_t * aux8 = (const uint8_t *)aux32;
 
3315
 
3316
  // ====================== 2.3125 bpw (de)-quantization
3317
 
3318
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
3319
  assert(k % QK_K == 0);
3320
+ const int64_t nb = k / QK_K;
3321
 
3322
  float db[2];
3323
 
 
3342
 
3343
  // ====================== 2.5625 bpw (de)-quantization
3344
 
3345
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
3346
  assert(k % QK_K == 0);
3347
+ const int64_t nb = k / QK_K;
3348
 
3349
  float db[2];
3350
 
 
3374
 
3375
  // ====================== 3.0625 bpw (de)-quantization
3376
 
3377
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
3378
  assert(k % QK_K == 0);
3379
+ const int64_t nb = k / QK_K;
3380
 
3381
  uint32_t aux32;
3382
 
 
3406
 
3407
  // ====================== 3.3125 bpw (de)-quantization
3408
 
3409
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
3410
  assert(k % QK_K == 0);
3411
+ const int64_t nb = k / QK_K;
3412
 
3413
  for (int i = 0; i < nb; i++) {
3414
 
 
3449
 
3450
  // ====================== 1.5625 bpw (de)-quantization
3451
 
3452
+ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
3453
  assert(k % QK_K == 0);
3454
+ const int64_t nb = k / QK_K;
3455
 
3456
  for (int i = 0; i < nb; i++) {
3457
 
 
3474
  }
3475
  }
3476
 
3477
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
3478
  assert(k % QK_K == 0);
3479
+ const int64_t nb = k / QK_K;
3480
 
3481
  float delta[4];
3482
  uint16_t idx[4];
 
3535
 
3536
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3537
 
3538
+ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
3539
  assert(k % QK4_NL == 0);
3540
+ const int64_t nb = k / QK4_NL;
3541
 
3542
  for (int i = 0; i < nb; i++) {
3543
 
 
3553
  }
3554
  }
3555
 
3556
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
3557
  assert(k % QK_K == 0);
3558
  #if QK_K == 64
3559
  dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
3560
  #else
3561
+ const int64_t nb = k / QK_K;
3562
 
3563
  for (int i = 0; i < nb; i++) {
3564
 
 
3582
 
3583
  //===================================== Q8_K ==============================================
3584
 
3585
+ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3586
  assert(k % QK_K == 0);
3587
+ const int64_t nb = k / QK_K;
3588
 
3589
  for (int i = 0; i < nb; i++) {
3590
 
 
3621
  }
3622
  }
3623
 
3624
+ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
3625
  assert(k % QK_K == 0);
3626
+ const int64_t nb = k / QK_K;
3627
 
3628
  for (int i = 0; i < nb; i++) {
3629
  for (int j = 0; j < QK_K; ++j) {
 
3632
  }
3633
  }
3634
 
3635
+ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3636
  quantize_row_q8_K_reference(x, y, k);
3637
  }
3638
 
 
10648
  return grid_index;
10649
  }
10650
 
10651
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10652
 
10653
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
10654
 
 
10664
 
10665
  const int kMaxQ = 3;
10666
 
10667
+ const int64_t nbl = n/QK_K;
10668
 
10669
  block_iq2_xxs * y = vy;
10670
 
 
10821
  }
10822
  }
10823
 
10824
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10825
 
10826
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
10827
 
 
10837
 
10838
  const int kMaxQ = 3;
10839
 
10840
+ const int64_t nbl = n/QK_K;
10841
 
10842
  block_iq2_xs * y = vy;
10843
 
 
11001
  }
11002
  }
11003
 
11004
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11005
  GGML_ASSERT(n_per_row%QK_K == 0);
11006
+ int64_t nblock = n_per_row/QK_K;
11007
  char * qrow = (char *)dst;
11008
+ for (int64_t row = 0; row < nrow; ++row) {
11009
  quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
11010
  src += n_per_row;
11011
  qrow += nblock*sizeof(block_iq2_xxs);
 
11013
  return nrow * nblock * sizeof(block_iq2_xxs);
11014
  }
11015
 
11016
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11017
  GGML_ASSERT(n_per_row%QK_K == 0);
11018
+ int64_t nblock = n_per_row/QK_K;
11019
  char * qrow = (char *)dst;
11020
+ for (int64_t row = 0; row < nrow; ++row) {
11021
  quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
11022
  src += n_per_row;
11023
  qrow += nblock*sizeof(block_iq2_xs);
 
11242
  return grid_index;
11243
  }
11244
 
11245
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
11246
  const float * restrict quant_weights) {
11247
 
11248
  const int gindex = iq3_data_index(grid_size);
 
11259
 
11260
  const int kMaxQ = 8;
11261
 
11262
+ const int64_t nbl = n/QK_K;
11263
 
11264
  ggml_fp16_t * dh;
11265
  uint8_t * qs;
 
11455
  }
11456
  }
11457
 
11458
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11459
  GGML_ASSERT(n_per_row%QK_K == 0);
11460
+ int64_t nblock = n_per_row/QK_K;
11461
  char * qrow = (char *)dst;
11462
+ for (int64_t row = 0; row < nrow; ++row) {
11463
  quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11464
  src += n_per_row;
11465
  qrow += nblock*sizeof(block_iq3_xxs);
 
11467
  return nrow * nblock * sizeof(block_iq3_xxs);
11468
  }
11469
 
11470
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
11471
  assert(k % QK_K == 0);
11472
  block_iq3_xxs * restrict y = vy;
11473
  quantize_row_iq3_xxs_reference(x, y, k);
11474
  }
11475
 
11476
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
11477
  assert(k % QK_K == 0);
11478
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11479
  }
 
11504
 
11505
  const int kMaxQ = 8;
11506
 
11507
+ const int64_t nbl = n/QK_K;
11508
 
11509
  block_iq3_s * y = vy;
11510
 
 
11661
  }
11662
 
11663
  #define IQ3S_BLOCK_SIZE 32
11664
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11665
  GGML_ASSERT(n_per_row%QK_K == 0);
11666
+ int64_t nblock = n_per_row/QK_K;
11667
  float scales[QK_K/IQ3S_BLOCK_SIZE];
11668
  float weight[IQ3S_BLOCK_SIZE];
11669
  float xval[IQ3S_BLOCK_SIZE];
 
11674
  bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
11675
  uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
11676
  char * qrow = (char *)dst;
11677
+ for (int64_t row = 0; row < nrow; ++row) {
11678
  quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
11679
  scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
11680
  src += n_per_row;
 
11683
  return nrow * nblock * sizeof(block_iq3_s);
11684
  }
11685
 
11686
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
11687
  assert(k % QK_K == 0);
11688
  block_iq3_s * restrict y = vy;
11689
  quantize_row_iq3_s_reference(x, y, k);
11690
  }
11691
 
11692
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
11693
  assert(k % QK_K == 0);
11694
  quantize_iq3_s(x, y, 1, k, NULL);
11695
  }
 
11822
 
11823
  #define IQ1S_BLOCK_SIZE 32
11824
  #define IQ1M_BLOCK_SIZE 16
11825
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
11826
  float * scales,
11827
  float * weight,
11828
  float * sumx,
 
11846
 
11847
  block_iq1_s * y = vy;
11848
 
11849
+ const int64_t nbl = n/QK_K;
11850
 
11851
  const int block_size = IQ1S_BLOCK_SIZE;
11852
 
 
11980
  }
11981
  }
11982
 
11983
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11984
  GGML_ASSERT(n_per_row%QK_K == 0);
11985
  float scales[QK_K/IQ1S_BLOCK_SIZE];
11986
  float weight[IQ1S_BLOCK_SIZE];
 
11990
  float pairs[2*IQ1S_BLOCK_SIZE];
11991
  uint16_t index[IQ1S_BLOCK_SIZE/8];
11992
  int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11993
+ int64_t nblock = n_per_row/QK_K;
11994
  char * qrow = (char *)dst;
11995
+ for (int64_t row = 0; row < nrow; ++row) {
11996
  quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
11997
  src += n_per_row;
11998
  qrow += nblock*sizeof(block_iq1_s);
 
12000
  return nrow * nblock * sizeof(block_iq1_s);
12001
  }
12002
 
12003
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
12004
  float * scales,
12005
  float * weight,
12006
  float * pairs,
 
12022
 
12023
  block_iq1_m * y = vy;
12024
 
12025
+ const int64_t nbl = n/QK_K;
12026
 
12027
  const int block_size = IQ1M_BLOCK_SIZE;
12028
 
 
12265
  }
12266
  }
12267
 
12268
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12269
  GGML_ASSERT(n_per_row%QK_K == 0);
12270
  float scales[QK_K/IQ1M_BLOCK_SIZE];
12271
  float weight[IQ1M_BLOCK_SIZE];
 
12273
  float pairs[2*IQ1M_BLOCK_SIZE];
12274
  uint16_t index[IQ1M_BLOCK_SIZE/8];
12275
  int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
12276
+ int64_t nblock = n_per_row/QK_K;
12277
  char * qrow = (char *)dst;
12278
+ for (int64_t row = 0; row < nrow; ++row) {
12279
  quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
12280
  src += n_per_row;
12281
  qrow += nblock*sizeof(block_iq1_m);
 
12407
  }
12408
  }
12409
 
12410
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12411
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12412
+ int64_t nblock = n_per_row/QK4_NL;
12413
  char * qrow = (char *)dst;
12414
  uint8_t L[QK4_NL];
12415
  float weight[QK4_NL];
12416
  uint16_t unused_h;
12417
  uint8_t * unused_l = NULL;
12418
  float scale;
12419
+ for (int64_t row = 0; row < nrow; ++row) {
12420
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
12421
  for (int ibl = 0; ibl < nblock; ++ibl) {
12422
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
 
12429
  return nrow * nblock * sizeof(block_iq4_nl);
12430
  }
12431
 
12432
+ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
12433
  GGML_ASSERT(k%QK4_NL == 0);
12434
+ int64_t nblock = k/QK4_NL;
12435
  uint8_t L[QK4_NL];
12436
  float weight[QK4_NL];
12437
  uint16_t unused_h;
 
12444
  }
12445
  }
12446
 
12447
+ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
12448
  assert(k % QK4_NL == 0);
12449
  quantize_row_iq4_nl(x, y, k);
12450
  }
12451
 
12452
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12453
  #if QK_K == 64
12454
  return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12455
  #else
12456
  GGML_ASSERT(n_per_row%QK_K == 0);
12457
+ int64_t nblock = n_per_row/QK_K;
12458
  char * qrow = (char *)dst;
12459
  uint8_t L[QK_K];
12460
  float weight[32];
12461
  float scales[QK_K/32];
12462
+ for (int64_t row = 0; row < nrow; ++row) {
12463
  block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12464
  for (int ibl = 0; ibl < nblock; ++ibl) {
12465
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
 
12473
  #endif
12474
  }
12475
 
12476
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
12477
  assert(k % QK_K == 0);
12478
  block_iq4_xs * restrict y = vy;
12479
  quantize_row_iq4_xs_reference(x, y, k);
12480
  }
12481
 
12482
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
12483
  assert(k % QK_K == 0);
12484
  quantize_iq4_xs(x, y, 1, k, NULL);
12485
  }
12486
 
12487
  // =============================== 2.5625 bpw
12488
 
12489
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
12490
 
12491
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12492
 
 
12501
 
12502
  const int kMaxQ = 3;
12503
 
12504
+ const int64_t nbl = n/QK_K;
12505
 
12506
  block_iq2_s * y = vy;
12507
 
 
12654
  }
12655
  }
12656
 
12657
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12658
  GGML_ASSERT(n_per_row%QK_K == 0);
12659
+ int64_t nblock = n_per_row/QK_K;
12660
  char * qrow = (char *)dst;
12661
+ for (int64_t row = 0; row < nrow; ++row) {
12662
  quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12663
  src += n_per_row;
12664
  qrow += nblock*sizeof(block_iq2_s);
 
12666
  return nrow * nblock * sizeof(block_iq2_s);
12667
  }
12668
 
12669
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
12670
  assert(k % QK_K == 0);
12671
  quantize_iq2_s(x, y, 1, k, NULL);
12672
  }
12673
 
12674
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
12675
  assert(k % QK_K == 0);
12676
  block_iq2_s * restrict y = vy;
12677
  quantize_row_iq2_s_reference(x, y, k);
ggml-quants.h CHANGED
@@ -12,70 +12,70 @@ extern "C" {
12
  #endif
13
 
14
  // Quantization
15
- void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
16
- void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
17
- void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
18
- void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
19
- void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
20
- void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
21
-
22
- void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
23
- void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
24
- void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
25
- void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
26
- void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
27
- void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
28
-
29
- void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
30
- void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
31
- void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
32
- void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
33
- void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
34
-
35
- void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
36
- void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
37
- void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
38
- void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
39
- void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
40
- void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
41
-
42
- void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
43
- void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
44
- void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
45
- void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
46
- void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
47
- void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
48
-
49
- void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
50
- void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
51
- void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
52
- void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
53
- void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
54
 
55
  // Dequantization
56
- void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
57
- void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
58
- void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
59
- void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
60
- void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
61
- //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
62
-
63
- void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
64
- void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
65
- void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
66
- void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
67
- void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
68
- void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
69
-
70
- void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
71
- void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
72
- void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
73
- void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
74
- void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
75
- void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
76
- void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
77
- void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
78
- void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
79
 
80
  // Dot product
81
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
101
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
102
 
103
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
104
- size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
105
- size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
106
- size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
107
- size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
108
- size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
109
- size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
110
- size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
111
- size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
112
- size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
113
-
114
- size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
115
- size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
116
- size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
117
- size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
118
- size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
119
- size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
120
- size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
121
- size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
122
- size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
123
- size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
124
 
125
  void iq2xs_init_impl(enum ggml_type type);
126
  void iq2xs_free_impl(enum ggml_type type);
 
12
  #endif
13
 
14
  // Quantization
15
+ void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
16
+ void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
17
+ void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
18
+ void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
19
+ void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
20
+ void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
21
+
22
+ void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
23
+ void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
24
+ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
25
+ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
26
+ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
27
+ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
28
+
29
+ void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
30
+ void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
31
+ void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
32
+ void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
33
+ void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
34
+
35
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
36
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
37
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
38
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
39
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
40
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
41
+
42
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
43
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
44
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
45
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
46
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
47
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
48
+
49
+ void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
50
+ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
51
+ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
52
+ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
53
+ void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
54
 
55
  // Dequantization
56
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
57
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
58
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
59
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
60
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
61
+ //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
62
+
63
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
64
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
65
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
66
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
67
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
68
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
69
+
70
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
71
+ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
72
+ void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
73
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
74
+ void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
75
+ void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
76
+ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
77
+ void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
78
+ void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
79
 
80
  // Dot product
81
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
101
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
102
 
103
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
104
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
105
+ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
106
+ size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
107
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
108
+ size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
109
+ size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
110
+ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
111
+ size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
112
+ size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
113
+
114
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
115
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
116
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
117
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
118
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
119
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
120
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
121
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
122
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
123
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
124
 
125
  void iq2xs_init_impl(enum ggml_type type);
126
  void iq2xs_free_impl(enum ggml_type type);
ggml.c CHANGED
@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
338
  return GGML_FP32_TO_FP16(x);
339
  }
340
 
341
- void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
342
- for (int i = 0; i < n; i++) {
343
  y[i] = GGML_FP16_TO_FP32(x[i]);
344
  }
345
  }
346
 
347
- void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
348
- int i = 0;
349
  #if defined(__F16C__)
350
  for (; i + 7 < n; i += 8) {
351
  __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -20331,11 +20331,11 @@ size_t ggml_quantize_chunk(
20331
  enum ggml_type type,
20332
  const float * src,
20333
  void * dst,
20334
- int start,
20335
- int nrows,
20336
- int n_per_row,
20337
  const float * imatrix) {
20338
- const int n = nrows * n_per_row;
20339
 
20340
  if (ggml_quantize_requires_imatrix(type)) {
20341
  GGML_ASSERT(imatrix != NULL);
 
338
  return GGML_FP32_TO_FP16(x);
339
  }
340
 
341
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
342
+ for (int64_t i = 0; i < n; i++) {
343
  y[i] = GGML_FP16_TO_FP32(x[i]);
344
  }
345
  }
346
 
347
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
348
+ int64_t i = 0;
349
  #if defined(__F16C__)
350
  for (; i + 7 < n; i += 8) {
351
  __m256 x_vec = _mm256_loadu_ps(x + i);
 
20331
  enum ggml_type type,
20332
  const float * src,
20333
  void * dst,
20334
+ int64_t start,
20335
+ int64_t nrows,
20336
+ int64_t n_per_row,
20337
  const float * imatrix) {
20338
+ const int64_t n = (int64_t) nrows * n_per_row;
20339
 
20340
  if (ggml_quantize_requires_imatrix(type)) {
20341
  GGML_ASSERT(imatrix != NULL);
ggml.h CHANGED
@@ -332,8 +332,8 @@ extern "C" {
332
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
333
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
334
 
335
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
336
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
337
 
338
  struct ggml_object;
339
  struct ggml_context;
@@ -2210,9 +2210,9 @@ extern "C" {
2210
  enum ggml_type type,
2211
  const float * src,
2212
  void * dst,
2213
- int start,
2214
- int nrows,
2215
- int n_per_row,
2216
  const float * imatrix);
2217
 
2218
  //
@@ -2377,8 +2377,8 @@ extern "C" {
2377
  #else
2378
  #define GGML_RESTRICT restrict
2379
  #endif
2380
- typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
2381
- typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
2382
  typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2383
  const void * GGML_RESTRICT y, size_t by, int nrc);
2384
 
 
332
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
333
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
334
 
335
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
336
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
337
 
338
  struct ggml_object;
339
  struct ggml_context;
 
2210
  enum ggml_type type,
2211
  const float * src,
2212
  void * dst,
2213
+ int64_t start,
2214
+ int64_t nrows,
2215
+ int64_t n_per_row,
2216
  const float * imatrix);
2217
 
2218
  //
 
2377
  #else
2378
  #define GGML_RESTRICT restrict
2379
  #endif
2380
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2381
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2382
  typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2383
  const void * GGML_RESTRICT y, size_t by, int nrc);
2384