Kawrakow ikawrakow ggerganov commited on
Commit
2b9bb9e
·
unverified ·
1 Parent(s): f18f386

Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (llama/5721)

Browse files

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (7) hide show
  1. ggml-cuda.cu +357 -1
  2. ggml-metal.m +31 -6
  3. ggml-metal.metal +487 -0
  4. ggml-quants.c +767 -8
  5. ggml-quants.h +14 -0
  6. ggml.c +31 -0
  7. ggml.h +2 -0
ggml-cuda.cu CHANGED
@@ -523,6 +523,17 @@ typedef struct {
523
  } block_iq2_xs;
524
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
525
 
 
 
 
 
 
 
 
 
 
 
 
526
  #define QR3_XXS 8
527
  #define QI3_XXS (QK_K / (4*QR3_XXS))
528
  typedef struct {
@@ -1689,6 +1700,265 @@ static const __device__ uint64_t iq2xs_grid[512] = {
1689
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1690
  };
1691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1692
  static const __device__ uint32_t iq3xxs_grid[256] = {
1693
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
1694
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -2037,6 +2307,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
2037
 
2038
  }
2039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2040
  template<typename dst_t>
2041
  static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
2042
 
@@ -4800,6 +5091,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
4800
  #endif
4801
  }
4802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4803
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
4804
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4805
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -6996,6 +7335,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
6996
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
6997
  }
6998
 
 
 
 
 
 
 
6999
  template<typename dst_t>
7000
  static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
7001
  const int nb = k / QK_K;
@@ -7057,6 +7402,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
7057
  return dequantize_row_iq2_xxs_cuda;
7058
  case GGML_TYPE_IQ2_XS:
7059
  return dequantize_row_iq2_xs_cuda;
 
 
7060
  case GGML_TYPE_IQ3_XXS:
7061
  return dequantize_row_iq3_xxs_cuda;
7062
  case GGML_TYPE_IQ1_S:
@@ -7098,6 +7445,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
7098
  return dequantize_row_iq2_xxs_cuda;
7099
  case GGML_TYPE_IQ2_XS:
7100
  return dequantize_row_iq2_xs_cuda;
 
 
7101
  case GGML_TYPE_IQ3_XXS:
7102
  return dequantize_row_iq3_xxs_cuda;
7103
  case GGML_TYPE_IQ1_S:
@@ -8848,6 +9197,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8848
  case GGML_TYPE_Q6_K:
8849
  case GGML_TYPE_IQ2_XXS:
8850
  case GGML_TYPE_IQ2_XS:
 
8851
  case GGML_TYPE_IQ3_XXS:
8852
  case GGML_TYPE_IQ1_S:
8853
  case GGML_TYPE_IQ4_NL:
@@ -8874,6 +9224,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8874
  case GGML_TYPE_Q5_K:
8875
  case GGML_TYPE_IQ2_XXS:
8876
  case GGML_TYPE_IQ2_XS:
 
8877
  case GGML_TYPE_IQ3_XXS:
8878
  case GGML_TYPE_IQ1_S:
8879
  case GGML_TYPE_IQ4_NL:
@@ -8971,6 +9322,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
8971
  mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
8972
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8973
  break;
 
 
 
 
8974
  case GGML_TYPE_IQ3_XXS:
8975
  mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
8976
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -11710,7 +12065,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11710
  }
11711
  ggml_type a_type = a->type;
11712
  if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
11713
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S) {
 
11714
  if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
11715
  return false;
11716
  }
 
523
  } block_iq2_xs;
524
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
525
 
526
+ // 2.5625 bpw quants
527
+ #define QR2_S 8
528
+ #define QI2_S (QK_K / (4*QR2_S))
529
+ typedef struct {
530
+ half d;
531
+ uint8_t qs[QK_K/4];
532
+ uint8_t qh[QK_K/32];
533
+ uint8_t scales[QK_K/32];
534
+ } block_iq2_s;
535
+ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
536
+
537
  #define QR3_XXS 8
538
  #define QI3_XXS (QK_K / (4*QR3_XXS))
539
  typedef struct {
 
1700
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1701
  };
1702
 
1703
+ static const __device__ uint64_t iq2s_grid[1024] = {
1704
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1705
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1706
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1707
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1708
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1709
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
1710
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
1711
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
1712
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
1713
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
1714
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
1715
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
1716
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
1717
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
1718
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
1719
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
1720
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
1721
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
1722
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
1723
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
1724
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
1725
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
1726
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
1727
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
1728
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
1729
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
1730
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
1731
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
1732
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
1733
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
1734
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
1735
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
1736
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
1737
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
1738
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
1739
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
1740
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
1741
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
1742
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
1743
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
1744
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
1745
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
1746
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
1747
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
1748
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
1749
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
1750
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
1751
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
1752
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
1753
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
1754
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
1755
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
1756
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
1757
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
1758
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
1759
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
1760
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
1761
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
1762
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
1763
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
1764
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
1765
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
1766
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
1767
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
1768
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
1769
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
1770
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
1771
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
1772
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
1773
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
1774
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
1775
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
1776
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
1777
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
1778
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
1779
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
1780
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
1781
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
1782
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
1783
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
1784
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
1785
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
1786
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
1787
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
1788
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
1789
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
1790
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
1791
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
1792
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
1793
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
1794
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
1795
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
1796
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
1797
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
1798
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
1799
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
1800
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
1801
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
1802
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
1803
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
1804
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
1805
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
1806
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
1807
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
1808
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
1809
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
1810
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
1811
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
1812
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
1813
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
1814
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
1815
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
1816
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
1817
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
1818
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
1819
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
1820
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
1821
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
1822
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
1823
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
1824
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
1825
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
1826
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
1827
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
1828
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
1829
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
1830
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
1831
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
1832
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
1833
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
1834
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
1835
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
1836
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
1837
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
1838
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
1839
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
1840
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
1841
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
1842
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
1843
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
1844
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
1845
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
1846
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
1847
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
1848
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
1849
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
1850
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
1851
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
1852
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
1853
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
1854
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
1855
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
1856
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
1857
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
1858
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
1859
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
1860
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
1861
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
1862
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
1863
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
1864
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
1865
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
1866
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
1867
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
1868
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
1869
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
1870
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
1871
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
1872
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
1873
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
1874
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
1875
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
1876
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
1877
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
1878
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
1879
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
1880
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
1881
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
1882
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
1883
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
1884
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
1885
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
1886
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
1887
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
1888
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
1889
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
1890
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
1891
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
1892
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
1893
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
1894
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
1895
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
1896
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
1897
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
1898
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
1899
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
1900
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
1901
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
1902
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
1903
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
1904
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
1905
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
1906
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
1907
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
1908
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
1909
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
1910
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
1911
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
1912
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
1913
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
1914
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
1915
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
1916
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
1917
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
1918
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
1919
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
1920
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
1921
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
1922
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
1923
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
1924
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
1925
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
1926
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
1927
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
1928
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
1929
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
1930
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
1931
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
1932
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
1933
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
1934
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
1935
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
1936
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
1937
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
1938
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
1939
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
1940
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
1941
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
1942
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
1943
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
1944
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
1945
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
1946
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
1947
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
1948
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
1949
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
1950
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
1951
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
1952
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
1953
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
1954
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
1955
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
1956
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
1957
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
1958
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
1959
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
1960
+ };
1961
+
1962
  static const __device__ uint32_t iq3xxs_grid[256] = {
1963
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
1964
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
 
2307
 
2308
  }
2309
 
2310
+ template<typename dst_t>
2311
+ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
2312
+
2313
+ const int i = blockIdx.x;
2314
+ const block_iq2_s * x = (const block_iq2_s *) vx;
2315
+
2316
+ const int tid = threadIdx.x;
2317
+ #if QK_K == 256
2318
+ const int il = tid/8; // 0...3
2319
+ const int ib = tid%8; // 0...7
2320
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
2321
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
2322
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
2323
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
2324
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
2325
+ #else
2326
+ assert(false);
2327
+ #endif
2328
+
2329
+ }
2330
+
2331
  template<typename dst_t>
2332
  static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
2333
 
 
5091
  #endif
5092
  }
5093
 
5094
+ // TODO
5095
+ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
5096
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
5097
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
5098
+ #if QK_K == 256
5099
+ const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
5100
+
5101
+ const int ib32 = iqs;
5102
+ const int8_t * q8 = bq8_1[ib32].qs;
5103
+ const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
5104
+ const uint8_t ls1 = bq2->scales[ib32] & 0xf;
5105
+ const uint8_t ls2 = bq2->scales[ib32] >> 4;
5106
+ int sumi1 = 0;
5107
+ for (int l = 0; l < 2; ++l) {
5108
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
5109
+ const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
5110
+ const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
5111
+ const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
5112
+ const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
5113
+ sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
5114
+ sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
5115
+ q8 += 8;
5116
+ }
5117
+ int sumi2 = 0;
5118
+ for (int l = 2; l < 4; ++l) {
5119
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
5120
+ const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
5121
+ const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
5122
+ const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
5123
+ const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
5124
+ sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
5125
+ sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
5126
+ q8 += 8;
5127
+ }
5128
+ const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
5129
+ return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
5130
+ #else
5131
+ (void) ksigns64;
5132
+ assert(false);
5133
+ return 0.f;
5134
+ #endif
5135
+ #else
5136
+ (void) ksigns64;
5137
+ assert(false);
5138
+ return 0.f;
5139
+ #endif
5140
+ }
5141
+
5142
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
5143
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
5144
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
7335
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
7336
  }
7337
 
7338
+ template<typename dst_t>
7339
+ static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
7340
+ const int nb = k / QK_K;
7341
+ dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
7342
+ }
7343
+
7344
  template<typename dst_t>
7345
  static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
7346
  const int nb = k / QK_K;
 
7402
  return dequantize_row_iq2_xxs_cuda;
7403
  case GGML_TYPE_IQ2_XS:
7404
  return dequantize_row_iq2_xs_cuda;
7405
+ case GGML_TYPE_IQ2_S:
7406
+ return dequantize_row_iq2_s_cuda;
7407
  case GGML_TYPE_IQ3_XXS:
7408
  return dequantize_row_iq3_xxs_cuda;
7409
  case GGML_TYPE_IQ1_S:
 
7445
  return dequantize_row_iq2_xxs_cuda;
7446
  case GGML_TYPE_IQ2_XS:
7447
  return dequantize_row_iq2_xs_cuda;
7448
+ case GGML_TYPE_IQ2_S:
7449
+ return dequantize_row_iq2_s_cuda;
7450
  case GGML_TYPE_IQ3_XXS:
7451
  return dequantize_row_iq3_xxs_cuda;
7452
  case GGML_TYPE_IQ1_S:
 
9197
  case GGML_TYPE_Q6_K:
9198
  case GGML_TYPE_IQ2_XXS:
9199
  case GGML_TYPE_IQ2_XS:
9200
+ case GGML_TYPE_IQ2_S:
9201
  case GGML_TYPE_IQ3_XXS:
9202
  case GGML_TYPE_IQ1_S:
9203
  case GGML_TYPE_IQ4_NL:
 
9224
  case GGML_TYPE_Q5_K:
9225
  case GGML_TYPE_IQ2_XXS:
9226
  case GGML_TYPE_IQ2_XS:
9227
+ case GGML_TYPE_IQ2_S:
9228
  case GGML_TYPE_IQ3_XXS:
9229
  case GGML_TYPE_IQ1_S:
9230
  case GGML_TYPE_IQ4_NL:
 
9322
  mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
9323
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
9324
  break;
9325
+ case GGML_TYPE_IQ2_S:
9326
+ mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
9327
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
9328
+ break;
9329
  case GGML_TYPE_IQ3_XXS:
9330
  mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
9331
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 
12065
  }
12066
  ggml_type a_type = a->type;
12067
  if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
12068
+ a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
12069
+ a_type == GGML_TYPE_IQ2_S) {
12070
  if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
12071
  return false;
12072
  }
ggml-metal.m CHANGED
@@ -62,6 +62,7 @@ enum ggml_metal_kernel_type {
62
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
63
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
64
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
 
65
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
66
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
67
  GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
@@ -87,6 +88,7 @@ enum ggml_metal_kernel_type {
87
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
88
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
89
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
 
90
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
91
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
92
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
@@ -108,6 +110,7 @@ enum ggml_metal_kernel_type {
108
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
109
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
110
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
 
111
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
112
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
113
  GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
@@ -126,6 +129,7 @@ enum ggml_metal_kernel_type {
126
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
127
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
128
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
 
129
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
130
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
131
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
@@ -144,6 +148,7 @@ enum ggml_metal_kernel_type {
144
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
145
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
146
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
 
147
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
148
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
149
  GGML_METAL_KERNEL_TYPE_ROPE_F32,
@@ -458,6 +463,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
458
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
459
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
460
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
 
461
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
462
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
463
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
@@ -483,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
483
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
484
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
485
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
 
486
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
487
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
488
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
@@ -504,6 +511,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
504
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
505
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
506
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
 
507
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
508
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
509
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
@@ -522,6 +530,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
522
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
523
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
524
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
 
525
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
526
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
527
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
@@ -540,6 +549,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
540
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
541
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
542
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
 
543
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
544
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
545
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
@@ -1358,6 +1368,7 @@ static bool ggml_metal_graph_compute(
1358
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
1359
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
1360
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
 
1361
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
1362
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
1363
  default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
@@ -1500,6 +1511,12 @@ static bool ggml_metal_graph_compute(
1500
  nth1 = 16;
1501
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
1502
  } break;
 
 
 
 
 
 
1503
  case GGML_TYPE_IQ1_S:
1504
  {
1505
  nth0 = 4;
@@ -1544,9 +1561,9 @@ static bool ggml_metal_graph_compute(
1544
  [encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
1545
  [encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
1546
 
1547
- if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
1548
- src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
1549
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
1550
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1551
  }
1552
  else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -1658,6 +1675,7 @@ static bool ggml_metal_graph_compute(
1658
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
1659
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
1660
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
 
1661
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
1662
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
1663
  default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
@@ -1803,6 +1821,12 @@ static bool ggml_metal_graph_compute(
1803
  nth1 = 16;
1804
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
1805
  } break;
 
 
 
 
 
 
1806
  case GGML_TYPE_IQ1_S:
1807
  {
1808
  nth0 = 4;
@@ -1863,9 +1887,9 @@ static bool ggml_metal_graph_compute(
1863
  [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
1864
  }
1865
 
1866
- if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
1867
- src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
1868
- src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
1869
  [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1870
  }
1871
  else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
@@ -1925,6 +1949,7 @@ static bool ggml_metal_graph_compute(
1925
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
1926
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
1927
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
 
1928
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
1929
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
1930
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
 
62
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
63
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
64
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
65
+ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
66
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
67
  GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
68
  GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
 
88
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
89
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
90
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
91
+ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
92
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
93
  GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
94
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
 
110
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
111
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
112
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
113
+ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
114
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
115
  GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
116
  GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
 
129
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
130
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
131
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
132
+ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
133
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
134
  GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
135
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
 
148
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
149
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
150
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
151
+ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
152
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
153
  GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
154
  GGML_METAL_KERNEL_TYPE_ROPE_F32,
 
463
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
464
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
465
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
466
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true);
467
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
468
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
469
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
 
489
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
490
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
491
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
492
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction);
493
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
494
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
495
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
 
511
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
512
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
513
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
514
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction);
515
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
516
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
517
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
 
530
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
531
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
532
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
533
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm);
534
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
535
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
536
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
 
549
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
550
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
551
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
552
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm);
553
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
554
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
555
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
 
1368
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
1369
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
1370
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
1371
+ case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
1372
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
1373
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
1374
  default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
 
1511
  nth1 = 16;
1512
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
1513
  } break;
1514
+ case GGML_TYPE_IQ2_S:
1515
+ {
1516
+ nth0 = 4;
1517
+ nth1 = 16;
1518
+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
1519
+ } break;
1520
  case GGML_TYPE_IQ1_S:
1521
  {
1522
  nth0 = 4;
 
1561
  [encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
1562
  [encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
1563
 
1564
+ if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
1565
+ src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
1566
+ src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
1567
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1568
  }
1569
  else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
 
1675
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
1676
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
1677
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
1678
+ case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
1679
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
1680
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
1681
  default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
 
1821
  nth1 = 16;
1822
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
1823
  } break;
1824
+ case GGML_TYPE_IQ2_S:
1825
+ {
1826
+ nth0 = 4;
1827
+ nth1 = 16;
1828
+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
1829
+ } break;
1830
  case GGML_TYPE_IQ1_S:
1831
  {
1832
  nth0 = 4;
 
1887
  [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
1888
  }
1889
 
1890
+ if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
1891
+ src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
1892
+ src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
1893
  [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1894
  }
1895
  else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
 
1949
  case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
1950
  case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
1951
  case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
1952
+ case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break;
1953
  case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
1954
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
1955
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
ggml-metal.metal CHANGED
@@ -2519,6 +2519,14 @@ typedef struct {
2519
  } block_iq2_xs;
2520
  // 74 bytes / block for QK_K = 256, so 2.3125 bpw
2521
 
 
 
 
 
 
 
 
 
2522
  typedef struct {
2523
  half d;
2524
  uint8_t qs[3*QK_K/8];
@@ -3774,6 +3782,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
3774
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3775
  };
3776
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3777
  constexpr constant static uint32_t iq3xxs_grid[256] = {
3778
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3779
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -4572,6 +4839,139 @@ kernel void kernel_mul_mv_iq3_s_f32(
4572
  kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4573
  }
4574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4575
  void kernel_mul_mv_iq1_s_f32_impl(
4576
  device const void * src0,
4577
  device const float * src1,
@@ -5188,6 +5588,25 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
5188
  }
5189
  }
5190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5191
  template <typename type4x4>
5192
  void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
5193
  // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
@@ -5762,6 +6181,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
5762
  template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5763
  template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5764
  template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
 
5765
  template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
5766
  template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
5767
 
@@ -5804,6 +6224,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
5804
  template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5805
  template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5806
  template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
 
5807
  template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
5808
  template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
5809
 
@@ -5858,6 +6279,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
5858
  template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5859
  template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5860
  template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
 
5861
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
5862
  template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
5863
 
@@ -6893,6 +7315,71 @@ kernel void kernel_mul_mv_id_iq3_s_f32(
6893
  sgitg);
6894
  }
6895
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6896
  [[host_name("kernel_mul_mv_id_iq1_s_f32")]]
6897
  kernel void kernel_mul_mv_id_iq1_s_f32(
6898
  device const char * ids,
 
2519
  } block_iq2_xs;
2520
  // 74 bytes / block for QK_K = 256, so 2.3125 bpw
2521
 
2522
+ // 2.5625 bpw quants
2523
+ typedef struct {
2524
+ half d;
2525
+ uint8_t qs[QK_K/4];
2526
+ uint8_t qh[QK_K/32];
2527
+ uint8_t scales[QK_K/32];
2528
+ } block_iq2_s;
2529
+
2530
  typedef struct {
2531
  half d;
2532
  uint8_t qs[3*QK_K/8];
 
3782
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3783
  };
3784
 
3785
+ constexpr constant static uint64_t iq2s_grid[1024] = {
3786
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3787
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3788
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3789
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3790
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3791
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3792
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3793
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3794
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3795
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3796
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3797
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3798
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3799
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3800
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3801
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3802
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3803
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3804
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3805
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3806
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3807
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3808
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3809
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3810
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3811
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3812
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3813
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3814
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3815
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3816
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3817
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3818
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3819
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3820
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3821
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3822
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3823
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3824
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3825
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3826
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3827
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3828
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3829
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3830
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3831
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3832
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3833
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3834
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3835
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3836
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3837
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3838
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3839
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3840
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3841
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3842
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3843
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3844
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3845
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3846
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3847
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3848
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3849
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3850
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3851
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3852
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3853
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3854
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3855
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3856
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3857
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3858
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3859
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3860
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3861
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3862
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3863
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3864
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3865
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3866
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3867
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3868
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3869
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3870
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3871
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3872
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3873
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3874
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3875
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3876
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3877
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3878
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3879
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3880
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3881
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3882
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3883
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3884
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3885
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3886
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3887
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3888
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3889
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3890
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3891
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3892
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3893
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3894
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3895
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3896
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3897
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3898
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3899
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3900
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3901
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3902
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3903
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3904
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3905
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3906
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3907
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3908
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3909
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3910
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3911
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3912
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3913
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3914
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3915
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3916
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3917
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3918
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3919
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3920
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3921
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3922
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3923
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3924
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3925
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3926
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3927
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3928
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3929
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3930
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3931
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3932
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3933
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3934
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3935
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3936
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3937
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3938
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3939
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3940
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3941
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3942
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3943
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3944
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3945
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3946
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3947
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3948
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3949
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3950
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3951
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3952
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3953
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3954
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3955
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3956
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3957
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3958
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3959
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3960
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3961
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3962
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3963
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3964
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3965
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3966
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3967
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3968
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3969
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3970
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3971
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3972
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3973
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3974
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3975
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3976
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3977
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3978
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3979
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3980
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3981
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3982
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3983
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3984
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3985
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3986
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3987
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3988
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3989
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3990
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3991
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3992
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3993
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3994
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3995
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3996
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3997
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3998
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3999
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
4000
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
4001
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
4002
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
4003
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
4004
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
4005
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
4006
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
4007
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
4008
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
4009
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
4010
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
4011
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
4012
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
4013
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
4014
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
4015
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
4016
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
4017
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
4018
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
4019
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
4020
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
4021
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
4022
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
4023
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
4024
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
4025
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
4026
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
4027
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
4028
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
4029
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
4030
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
4031
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
4032
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
4033
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
4034
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
4035
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
4036
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
4037
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
4038
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
4039
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
4040
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
4041
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
4042
+ };
4043
+
4044
  constexpr constant static uint32_t iq3xxs_grid[256] = {
4045
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
4046
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
 
4839
  kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4840
  }
4841
 
4842
+ void kernel_mul_mv_iq2_s_f32_impl(
4843
+ device const void * src0,
4844
+ device const float * src1,
4845
+ device float * dst,
4846
+ constant int64_t & ne00,
4847
+ constant int64_t & ne01,
4848
+ constant int64_t & ne02,
4849
+ constant int64_t & ne10,
4850
+ constant int64_t & ne12,
4851
+ constant int64_t & ne0,
4852
+ constant int64_t & ne1,
4853
+ constant uint & r2,
4854
+ constant uint & r3,
4855
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4856
+ uint3 tgpig[[threadgroup_position_in_grid]],
4857
+ uint tiisg[[thread_index_in_simdgroup]],
4858
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4859
+
4860
+ const int nb = ne00/QK_K;
4861
+ const int r0 = tgpig.x;
4862
+ const int r1 = tgpig.y;
4863
+ const int im = tgpig.z;
4864
+
4865
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
4866
+ const int ib_row = first_row * nb;
4867
+
4868
+ const uint i12 = im%ne12;
4869
+ const uint i13 = im/ne12;
4870
+
4871
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4872
+
4873
+ device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
4874
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4875
+
4876
+ float yl[32];
4877
+ float sumf[N_DST]={0.f}, all_sum;
4878
+
4879
+ const int nb32 = nb * (QK_K / 32);
4880
+
4881
+ //threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
4882
+ //{
4883
+ // int nval = 32;
4884
+ // int pos = (32*sgitg + tiisg)*nval;
4885
+ // for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
4886
+ // threadgroup_barrier(mem_flags::mem_threadgroup);
4887
+ //}
4888
+
4889
+ const int ix = tiisg;
4890
+
4891
+ device const float * y4 = y + 32 * ix;
4892
+
4893
+ for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
4894
+
4895
+ for (int i = 0; i < 32; ++i) {
4896
+ yl[i] = y4[i];
4897
+ }
4898
+
4899
+ const int ibl = ib32 / (QK_K / 32);
4900
+ const int ib = ib32 % (QK_K / 32);
4901
+
4902
+ device const block_iq2_s * xr = x + ibl;
4903
+ device const uint8_t * qs = xr->qs + 4 * ib;
4904
+ device const uint8_t * qh = xr->qh + ib;
4905
+ device const uint8_t * sc = xr->scales + ib;
4906
+ device const uint8_t * signs = qs + QK_K/8;
4907
+ device const half * dh = &xr->d;
4908
+
4909
+ for (int row = 0; row < N_DST; row++) {
4910
+
4911
+ const float db = dh[0];
4912
+ const float d1 = db * (0.5f + (sc[0] & 0xf));
4913
+ const float d2 = db * (0.5f + (sc[0] >> 4));
4914
+
4915
+ float2 sum = {0};
4916
+ for (int l = 0; l < 2; ++l) {
4917
+ //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
4918
+ //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
4919
+ constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
4920
+ constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
4921
+ for (int j = 0; j < 8; ++j) {
4922
+ sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
4923
+ sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
4924
+ }
4925
+ }
4926
+ sumf[row] += d1 * sum[0] + d2 * sum[1];
4927
+
4928
+ dh += nb*sizeof(block_iq2_s)/2;
4929
+ qs += nb*sizeof(block_iq2_s);
4930
+ qh += nb*sizeof(block_iq2_s);
4931
+ sc += nb*sizeof(block_iq2_s);
4932
+ signs += nb*sizeof(block_iq2_s);
4933
+ }
4934
+
4935
+ y4 += 32 * 32;
4936
+ }
4937
+
4938
+ for (int row = 0; row < N_DST; ++row) {
4939
+ all_sum = simd_sum(sumf[row]);
4940
+ if (tiisg == 0) {
4941
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
4942
+ }
4943
+ }
4944
+ }
4945
+
4946
+ [[host_name("kernel_mul_mv_iq2_s_f32")]]
4947
+ kernel void kernel_mul_mv_iq2_s_f32(
4948
+ device const void * src0,
4949
+ device const float * src1,
4950
+ device float * dst,
4951
+ constant int64_t & ne00,
4952
+ constant int64_t & ne01,
4953
+ constant int64_t & ne02,
4954
+ constant uint64_t & nb00,
4955
+ constant uint64_t & nb01,
4956
+ constant uint64_t & nb02,
4957
+ constant int64_t & ne10,
4958
+ constant int64_t & ne11,
4959
+ constant int64_t & ne12,
4960
+ constant uint64_t & nb10,
4961
+ constant uint64_t & nb11,
4962
+ constant uint64_t & nb12,
4963
+ constant int64_t & ne0,
4964
+ constant int64_t & ne1,
4965
+ constant uint & r2,
4966
+ constant uint & r3,
4967
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4968
+ uint3 tgpig[[threadgroup_position_in_grid]],
4969
+ uint tiisg[[thread_index_in_simdgroup]],
4970
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4971
+
4972
+ kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4973
+ }
4974
+
4975
  void kernel_mul_mv_iq1_s_f32_impl(
4976
  device const void * src0,
4977
  device const float * src1,
 
5588
  }
5589
  }
5590
 
5591
+ template <typename type4x4>
5592
+ void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
5593
+ // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
5594
+ const float d = xb->d;
5595
+ const int ib32 = il/2;
5596
+ il = il%2;
5597
+ // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
5598
+ device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
5599
+ device const uint8_t * signs = qs + QK_K/8;
5600
+ const uint8_t qh = xb->qh[ib32] >> 4*il;
5601
+ const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
5602
+ constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
5603
+ constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
5604
+ for (int i = 0; i < 8; ++i) {
5605
+ reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
5606
+ reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
5607
+ }
5608
+ }
5609
+
5610
  template <typename type4x4>
5611
  void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
5612
  // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
 
6181
  template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
6182
  template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6183
  template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
6184
+ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
6185
  template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
6186
  template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
6187
 
 
6224
  template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
6225
  template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6226
  template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
6227
+ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
6228
  template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
6229
  template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
6230
 
 
6279
  template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
6280
  template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6281
  template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
6282
+ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
6283
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
6284
  template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
6285
 
 
7315
  sgitg);
7316
  }
7317
 
7318
+ [[host_name("kernel_mul_mv_id_iq2_s_f32")]]
7319
+ kernel void kernel_mul_mv_id_iq2_s_f32(
7320
+ device const char * ids,
7321
+ device const char * src1,
7322
+ device float * dst,
7323
+ constant uint64_t & nbi1,
7324
+ constant int64_t & ne00,
7325
+ constant int64_t & ne01,
7326
+ constant int64_t & ne02,
7327
+ constant uint64_t & nb00,
7328
+ constant uint64_t & nb01,
7329
+ constant uint64_t & nb02,
7330
+ constant int64_t & ne10,
7331
+ constant int64_t & ne11,
7332
+ constant int64_t & ne12,
7333
+ constant int64_t & ne13,
7334
+ constant uint64_t & nb10,
7335
+ constant uint64_t & nb11,
7336
+ constant uint64_t & nb12,
7337
+ constant int64_t & ne0,
7338
+ constant int64_t & ne1,
7339
+ constant uint64_t & nb1,
7340
+ constant uint & r2,
7341
+ constant uint & r3,
7342
+ constant int & idx,
7343
+ device const char * src00,
7344
+ device const char * src01,
7345
+ device const char * src02,
7346
+ device const char * src03,
7347
+ device const char * src04,
7348
+ device const char * src05,
7349
+ device const char * src06,
7350
+ device const char * src07,
7351
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
7352
+ uint3 tgpig[[threadgroup_position_in_grid]],
7353
+ uint tiitg[[thread_index_in_threadgroup]],
7354
+ uint tiisg[[thread_index_in_simdgroup]],
7355
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
7356
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
7357
+
7358
+ const int64_t bid = tgpig.z/(ne12*ne13);
7359
+
7360
+ tgpig.z = tgpig.z%(ne12*ne13);
7361
+
7362
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
7363
+
7364
+ kernel_mul_mv_iq2_s_f32_impl(
7365
+ src0[id],
7366
+ (device const float *) (src1 + bid*nb11),
7367
+ dst + bid*ne0,
7368
+ ne00,
7369
+ ne01,
7370
+ ne02,
7371
+ ne10,
7372
+ ne12,
7373
+ ne0,
7374
+ ne1,
7375
+ r2,
7376
+ r3,
7377
+ shared_values,
7378
+ tgpig,
7379
+ tiisg,
7380
+ sgitg);
7381
+ }
7382
+
7383
  [[host_name("kernel_mul_mv_id_iq1_s_f32")]]
7384
  kernel void kernel_mul_mv_id_iq1_s_f32(
7385
  device const char * ids,
ggml-quants.c CHANGED
@@ -3495,6 +3495,265 @@ static const uint64_t iq2xs_grid[512] = {
3495
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3496
  };
3497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3498
  static const uint32_t iq3xxs_grid[256] = {
3499
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3500
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -3796,6 +4055,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3796
  }
3797
  }
3798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3799
  // ====================== 3.0625 bpw (de)-quantization
3800
 
3801
  void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
@@ -9330,6 +9621,210 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9330
  #endif
9331
  }
9332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9333
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9334
  assert(n % QK_K == 0);
9335
  assert(nrc == 1);
@@ -9934,22 +10429,25 @@ typedef struct {
9934
  uint16_t * neighbours;
9935
  } iq2_entry_t;
9936
 
9937
- static iq2_entry_t iq2_data[3] = {
 
9938
  {NULL, NULL, NULL},
9939
  {NULL, NULL, NULL},
9940
  {NULL, NULL, NULL},
9941
  };
9942
 
9943
  static inline int iq2_data_index(enum ggml_type type) {
9944
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
9945
  return type == GGML_TYPE_IQ2_XXS ? 0 :
9946
- type == GGML_TYPE_IQ2_XS ? 1 : 2;
 
9947
  }
9948
 
9949
  static inline int iq2_grid_size(enum ggml_type type) {
9950
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
9951
  return type == GGML_TYPE_IQ2_XXS ? 256 :
9952
- type == GGML_TYPE_IQ2_XS ? 512 : 512;
 
9953
  }
9954
 
9955
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10050,11 +10548,79 @@ void iq2xs_init_impl(enum ggml_type type) {
10050
  41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
10051
  42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
10052
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10053
 
10054
  const int kmap_size = 43692;
10055
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
 
10056
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10057
- type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512;
 
10058
  uint64_t * kgrid_q2xs;
10059
  int * kmap_q2xs;
10060
  uint16_t * kneighbors_q2xs;
@@ -10151,7 +10717,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10151
  }
10152
 
10153
  void iq2xs_free_impl(enum ggml_type type) {
10154
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10155
  const int gindex = iq2_data_index(type);
10156
  if (iq2_data[gindex].grid) {
10157
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
@@ -11557,3 +12123,196 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
11557
  quantize_iq4_nl(x, y, 1, k, NULL, NULL);
11558
  }
11559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3495
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3496
  };
3497
 
3498
+ static const uint64_t iq2s_grid[1024] = {
3499
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3500
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3501
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3502
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3503
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3504
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3505
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3506
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3507
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3508
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3509
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3510
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3511
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3512
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3513
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3514
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3515
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3516
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3517
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3518
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3519
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3520
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3521
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3522
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3523
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3524
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3525
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3526
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3527
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3528
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3529
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3530
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3531
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3532
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3533
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3534
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3535
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3536
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3537
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3538
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3539
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3540
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3541
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3542
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3543
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3544
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3545
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3546
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3547
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3548
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3549
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3550
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3551
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3552
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3553
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3554
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3555
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3556
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3557
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3558
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3559
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3560
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3561
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3562
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3563
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3564
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3565
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3566
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3567
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3568
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3569
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3570
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3571
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3572
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3573
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3574
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3575
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3576
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3577
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3578
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3579
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3580
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3581
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3582
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3583
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3584
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3585
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3586
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3587
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3588
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3589
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3590
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3591
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3592
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3593
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3594
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3595
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3596
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3597
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3598
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3599
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3600
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3601
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3602
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3603
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3604
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3605
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3606
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3607
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3608
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3609
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3610
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3611
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3612
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3613
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3614
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3615
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3616
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3617
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3618
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3619
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3620
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3621
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3622
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3623
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3624
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3625
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3626
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3627
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3628
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3629
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3630
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3631
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3632
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3633
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3634
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3635
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3636
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3637
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3638
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3639
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3640
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3641
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3642
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3643
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3644
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3645
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3646
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3647
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3648
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3649
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3650
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3651
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3652
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3653
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3654
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3655
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3656
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3657
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3658
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3659
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3660
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3661
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3662
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3663
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3664
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3665
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3666
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3667
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3668
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3669
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3670
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3671
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3672
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3673
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3674
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3675
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3676
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3677
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3678
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3679
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3680
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3681
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3682
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3683
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3684
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3685
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3686
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3687
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3688
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3689
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3690
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3691
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3692
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3693
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3694
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3695
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3696
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3697
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3698
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3699
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3700
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3701
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3702
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3703
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3704
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3705
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3706
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3707
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3708
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3709
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3710
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3711
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3712
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
3713
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
3714
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
3715
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
3716
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
3717
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
3718
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
3719
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
3720
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
3721
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
3722
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
3723
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
3724
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
3725
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
3726
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
3727
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
3728
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
3729
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
3730
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
3731
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
3732
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
3733
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
3734
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
3735
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
3736
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
3737
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
3738
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
3739
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
3740
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
3741
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
3742
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
3743
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
3744
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
3745
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
3746
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
3747
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
3748
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
3749
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
3750
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
3751
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
3752
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
3753
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
3754
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
3755
+ };
3756
+
3757
  static const uint32_t iq3xxs_grid[256] = {
3758
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3759
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
 
4055
  }
4056
  }
4057
 
4058
+ // ====================== 2.5625 bpw (de)-quantization
4059
+
4060
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
4061
+ assert(k % QK_K == 0);
4062
+ const int nb = k / QK_K;
4063
+
4064
+ float db[2];
4065
+
4066
+ for (int i = 0; i < nb; i++) {
4067
+
4068
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4069
+ const uint8_t * qs = x[i].qs;
4070
+ const uint8_t * qh = x[i].qh;
4071
+ const uint8_t * signs = qs + QK_K/8;
4072
+
4073
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
4074
+ db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
4075
+ db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
4076
+ for (int l = 0; l < 4; ++l) {
4077
+ const float dl = db[l/2];
4078
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
4079
+ for (int j = 0; j < 8; ++j) {
4080
+ y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
4081
+ }
4082
+ y += 8;
4083
+ }
4084
+ qs += 4;
4085
+ signs += 4;
4086
+ }
4087
+ }
4088
+ }
4089
+
4090
  // ====================== 3.0625 bpw (de)-quantization
4091
 
4092
  void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
 
9621
  #endif
9622
  }
9623
 
9624
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9625
+ assert(n % QK_K == 0);
9626
+ assert(nrc == 1);
9627
+ UNUSED(nrc);
9628
+ UNUSED(bx);
9629
+ UNUSED(by);
9630
+ UNUSED(bs);
9631
+
9632
+ const block_iq2_s * restrict x = vx;
9633
+ const block_q8_K * restrict y = vy;
9634
+
9635
+ const int nb = n / QK_K;
9636
+
9637
+ #if defined(__ARM_NEON)
9638
+
9639
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9640
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9641
+ };
9642
+
9643
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9644
+
9645
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
9646
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9647
+ const uint8x16_t m1 = vdupq_n_u8(1);
9648
+ const int32x4_t vzero = vdupq_n_s32(0);
9649
+
9650
+ uint8x16x2_t vs;
9651
+ ggml_int8x16x4_t q2s;
9652
+ ggml_int8x16x4_t q8b;
9653
+
9654
+ float sumf = 0;
9655
+ for (int i = 0; i < nb; ++i) {
9656
+
9657
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9658
+
9659
+ const uint8_t * restrict qs = x[i].qs;
9660
+ const uint8_t * restrict qh = x[i].qh;
9661
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9662
+ const int8_t * restrict q8 = y[i].qs;
9663
+
9664
+ int sumi1 = 0, sumi2 = 0;
9665
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9666
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9667
+ q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
9668
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
9669
+ q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
9670
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
9671
+ q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
9672
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
9673
+ q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
9674
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
9675
+ qs += 8;
9676
+
9677
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9678
+ vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9679
+ vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9680
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9681
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9682
+
9683
+ q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
9684
+ q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
9685
+
9686
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9687
+ vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9688
+ vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9689
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9690
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9691
+
9692
+ signs += 4;
9693
+
9694
+ q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
9695
+ q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
9696
+
9697
+ const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
9698
+ const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
9699
+ const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
9700
+ const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
9701
+
9702
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
9703
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
9704
+ sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
9705
+ sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
9706
+ }
9707
+ sumf += d*(sumi1 + sumi2);
9708
+ }
9709
+
9710
+ *s = 0.125f * sumf;
9711
+
9712
+ #elif defined(__AVX2__)
9713
+
9714
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9715
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9716
+ };
9717
+
9718
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9719
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9720
+ };
9721
+
9722
+ const __m128i m4 = _mm_set1_epi8(0xf);
9723
+ const __m128i m1 = _mm_set1_epi8(1);
9724
+
9725
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
9726
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
9727
+
9728
+ uint64_t aux64;
9729
+
9730
+ __m256 accumf = _mm256_setzero_ps();
9731
+ for (int i = 0; i < nb; ++i) {
9732
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9733
+ const uint8_t * restrict qs = x[i].qs;
9734
+ const uint8_t * restrict qh = x[i].qh;
9735
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9736
+ const int8_t * restrict q8 = y[i].qs;
9737
+
9738
+ memcpy(&aux64, x[i].scales, 8);
9739
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
9740
+ const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
9741
+
9742
+ __m256i sumi1 = _mm256_setzero_si256();
9743
+ __m256i sumi2 = _mm256_setzero_si256();
9744
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9745
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9746
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9747
+ const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
9748
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
9749
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
9750
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
9751
+ const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
9752
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
9753
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
9754
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
9755
+ qs += 8;
9756
+
9757
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
9758
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9759
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
9760
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
9761
+
9762
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
9763
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9764
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
9765
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
9766
+
9767
+ signs += 4;
9768
+
9769
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
9770
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
9771
+
9772
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
9773
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
9774
+ sumi1 = _mm256_add_epi32(sumi1, p1);
9775
+ sumi2 = _mm256_add_epi32(sumi2, p2);
9776
+ }
9777
+
9778
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
9779
+
9780
+ }
9781
+
9782
+ *s = 0.125f * hsum_float_8(accumf);
9783
+
9784
+ #else
9785
+
9786
+ float sumf = 0;
9787
+ for (int i = 0; i < nb; i++) {
9788
+
9789
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9790
+ const int8_t * q8 = y[i].qs;
9791
+ const uint8_t * qs = x[i].qs;
9792
+ const uint8_t * qh = x[i].qh;
9793
+ const uint8_t * signs = qs + QK_K/8;
9794
+
9795
+ int bsum = 0;
9796
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9797
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
9798
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
9799
+ int sumi1 = 0, sumi2 = 0;
9800
+ for (int l = 0; l < 2; ++l) {
9801
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9802
+ for (int j = 0; j < 8; ++j) {
9803
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9804
+ }
9805
+ q8 += 8;
9806
+ }
9807
+ for (int l = 2; l < 4; ++l) {
9808
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9809
+ for (int j = 0; j < 8; ++j) {
9810
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9811
+ }
9812
+ q8 += 8;
9813
+ }
9814
+ bsum += ls1 * sumi1 + ls2 * sumi2;
9815
+ qs += 4;
9816
+ signs += 4;
9817
+ }
9818
+
9819
+ sumf += d * bsum;
9820
+ }
9821
+
9822
+ *s = 0.125f * sumf;
9823
+
9824
+ #endif
9825
+
9826
+ }
9827
+
9828
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9829
  assert(n % QK_K == 0);
9830
  assert(nrc == 1);
 
10429
  uint16_t * neighbours;
10430
  } iq2_entry_t;
10431
 
10432
+ static iq2_entry_t iq2_data[4] = {
10433
+ {NULL, NULL, NULL},
10434
  {NULL, NULL, NULL},
10435
  {NULL, NULL, NULL},
10436
  {NULL, NULL, NULL},
10437
  };
10438
 
10439
  static inline int iq2_data_index(enum ggml_type type) {
10440
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10441
  return type == GGML_TYPE_IQ2_XXS ? 0 :
10442
+ type == GGML_TYPE_IQ2_XS ? 1 :
10443
+ type == GGML_TYPE_IQ1_S ? 2 : 3;
10444
  }
10445
 
10446
  static inline int iq2_grid_size(enum ggml_type type) {
10447
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10448
  return type == GGML_TYPE_IQ2_XXS ? 256 :
10449
+ type == GGML_TYPE_IQ2_XS ? 512 :
10450
+ type == GGML_TYPE_IQ1_S ? 512 : 1024;
10451
  }
10452
 
10453
  static int iq2_compare_func(const void * left, const void * right) {
 
10548
  41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
10549
  42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
10550
  };
10551
+ static const uint16_t kgrid_2bit_1024[1024] = {
10552
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
10553
+ 73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
10554
+ 165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
10555
+ 337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
10556
+ 517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
10557
+ 674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
10558
+ 1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
10559
+ 1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
10560
+ 1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
10561
+ 1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
10562
+ 2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
10563
+ 2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
10564
+ 2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
10565
+ 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
10566
+ 4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
10567
+ 4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
10568
+ 4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
10569
+ 4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
10570
+ 5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
10571
+ 5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
10572
+ 5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
10573
+ 5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
10574
+ 6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
10575
+ 6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
10576
+ 8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
10577
+ 8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
10578
+ 8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
10579
+ 9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
10580
+ 9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
10581
+ 10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
10582
+ 16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
10583
+ 16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
10584
+ 16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
10585
+ 16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
10586
+ 17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
10587
+ 17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
10588
+ 17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
10589
+ 17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
10590
+ 18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
10591
+ 18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
10592
+ 18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
10593
+ 20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
10594
+ 20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
10595
+ 20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
10596
+ 21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
10597
+ 21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
10598
+ 22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
10599
+ 22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
10600
+ 24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
10601
+ 24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
10602
+ 25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
10603
+ 26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
10604
+ 32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
10605
+ 33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
10606
+ 33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
10607
+ 33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
10608
+ 34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
10609
+ 35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
10610
+ 36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
10611
+ 37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
10612
+ 38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
10613
+ 39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
10614
+ 41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
10615
+ 42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
10616
+ };
10617
 
10618
  const int kmap_size = 43692;
10619
+ //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10620
+ const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
10621
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10622
+ type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10623
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
10624
  uint64_t * kgrid_q2xs;
10625
  int * kmap_q2xs;
10626
  uint16_t * kneighbors_q2xs;
 
10717
  }
10718
 
10719
  void iq2xs_free_impl(enum ggml_type type) {
10720
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10721
  const int gindex = iq2_data_index(type);
10722
  if (iq2_data[gindex].grid) {
10723
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
 
12123
  quantize_iq4_nl(x, y, 1, k, NULL, NULL);
12124
  }
12125
 
12126
+ // =============================== 2.5625 bpw
12127
+
12128
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12129
+
12130
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12131
+
12132
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
12133
+ const int * kmap_q2xs = iq2_data[gindex].map;
12134
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
12135
+
12136
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
12137
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12138
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
12139
+ GGML_ASSERT(n%QK_K == 0);
12140
+
12141
+ const int kMaxQ = 3;
12142
+
12143
+ const int nbl = n/256;
12144
+
12145
+ block_iq2_s * y = vy;
12146
+
12147
+ float scales[QK_K/16];
12148
+ float weight[16];
12149
+ float xval[16];
12150
+ int8_t L[16];
12151
+ int8_t Laux[16];
12152
+ float waux[16];
12153
+ bool is_on_grid[2];
12154
+ bool is_on_grid_aux[2];
12155
+ uint8_t block_signs[2];
12156
+
12157
+ for (int ibl = 0; ibl < nbl; ++ibl) {
12158
+
12159
+ memset(&y[ibl], 0, sizeof(block_iq2_s));
12160
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
12161
+
12162
+ float max_scale = 0;
12163
+
12164
+ const float * xbl = x + QK_K*ibl;
12165
+ float sumx2 = 0;
12166
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12167
+ float sigma2 = 2*sumx2/QK_K;
12168
+
12169
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12170
+ const float * xb = xbl + 16*ib;
12171
+ if (quant_weights) {
12172
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
12173
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12174
+ } else {
12175
+ for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
12176
+ }
12177
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
12178
+ for (int k = 0; k < 2; ++k) {
12179
+ uint8_t s = 0;
12180
+ for (int i = 0; i < 8; ++i) {
12181
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
12182
+ else {
12183
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
12184
+ }
12185
+ }
12186
+ block_signs[k] = s;
12187
+ }
12188
+ float max = xval[0];
12189
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12190
+ if (!max) {
12191
+ scales[ib] = 0;
12192
+ continue;
12193
+ }
12194
+ float best = 0;
12195
+ float scale = max/(2*kMaxQ-1);
12196
+ is_on_grid[0] = is_on_grid[1] = true;
12197
+ for (int is = -9; is <= 9; ++is) {
12198
+ float id = (2*kMaxQ-1+is*0.1f)/max;
12199
+ float this_scale = 1/id;
12200
+ for (int k = 0; k < 2; ++k) {
12201
+ for (int i = 0; i < 8; ++i) {
12202
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12203
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
12204
+ }
12205
+ uint16_t u = 0;
12206
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
12207
+ int grid_index = kmap_q2xs[u];
12208
+ is_on_grid_aux[k] = true;
12209
+ if (grid_index < 0) {
12210
+ is_on_grid_aux[k] = false;
12211
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12212
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
12213
+ }
12214
+ }
12215
+ float sumqx = 0, sumq2 = 0;
12216
+ for (int i = 0; i < 16; ++i) {
12217
+ float w = weight[i];
12218
+ float q = 2*Laux[i] + 1;
12219
+ sumqx += w*xval[i]*q;
12220
+ sumq2 += w*q*q;
12221
+ }
12222
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
12223
+ scale = sumqx/sumq2; best = scale*sumqx;
12224
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
12225
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
12226
+ }
12227
+ }
12228
+ int n_not_ongrid = 0;
12229
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
12230
+ if (n_not_ongrid > 0 && scale > 0) {
12231
+ float id = 1/scale;
12232
+ for (int k = 0; k < 2; ++k) {
12233
+ if (is_on_grid[k]) continue;
12234
+ uint16_t u = 0;
12235
+ for (int i = 0; i < 8; ++i) {
12236
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12237
+ l = MAX(0, MIN(kMaxQ-1, l));
12238
+ u |= (l << 2*i);
12239
+ L[8*k + i] = l;
12240
+ }
12241
+ int grid_index = kmap_q2xs[u];
12242
+ if (grid_index < 0) {
12243
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12244
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
12245
+ }
12246
+ }
12247
+ float sumqx = 0, sumq2 = 0;
12248
+ for (int i = 0; i < 16; ++i) {
12249
+ float w = weight[i];
12250
+ float q = 2*L[i] + 1;
12251
+ sumqx += w*xval[i]*q;
12252
+ sumq2 += w*q*q;
12253
+ }
12254
+ if (sumq2 > 0) scale = sumqx/sumq2;
12255
+ }
12256
+ if (scale < 0) {
12257
+ scale = -scale;
12258
+ for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
12259
+ }
12260
+ for (int k = 0; k < 2; ++k) {
12261
+ uint16_t u = 0;
12262
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
12263
+ int grid_index = kmap_q2xs[u];
12264
+ if (grid_index < 0) {
12265
+ printf("Oops: found point %u not on grid:", u);
12266
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12267
+ printf("\n");
12268
+ GGML_ASSERT(false);
12269
+ }
12270
+ const int i8 = 2*ib + k;
12271
+ y[ibl].qs[i8] = grid_index & 255;
12272
+ y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
12273
+ y[ibl].qs[QK_K/8 + i8] = block_signs[k];
12274
+ }
12275
+ GGML_ASSERT(scale >= 0);
12276
+ scales[ib] = scale;
12277
+ max_scale = MAX(max_scale, scale);
12278
+ }
12279
+
12280
+ if (!max_scale) {
12281
+ continue;
12282
+ }
12283
+
12284
+ float d = max_scale/31;
12285
+ y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
12286
+ float id = 1/d;
12287
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12288
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
12289
+ l = MAX(0, MIN(15, l));
12290
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
12291
+ else y[ibl].scales[ib/2] |= (l << 4);
12292
+ }
12293
+ }
12294
+ }
12295
+
12296
+ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12297
+ (void)hist;
12298
+ GGML_ASSERT(n_per_row%QK_K == 0);
12299
+ int nblock = n_per_row/QK_K;
12300
+ char * qrow = (char *)dst;
12301
+ for (int row = 0; row < nrow; ++row) {
12302
+ quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12303
+ src += n_per_row;
12304
+ qrow += nblock*sizeof(block_iq2_s);
12305
+ }
12306
+ return nrow * nblock * sizeof(block_iq2_s);
12307
+ }
12308
+
12309
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12310
+ assert(k % QK_K == 0);
12311
+ quantize_iq2_s(x, y, 1, k, NULL, NULL);
12312
+ }
12313
+
12314
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12315
+ assert(k % QK_K == 0);
12316
+ block_iq2_s * restrict y = vy;
12317
+ quantize_row_iq2_s_reference(x, y, k);
12318
+ }
ggml-quants.h CHANGED
@@ -182,6 +182,15 @@ typedef struct {
182
  } block_iq2_xs;
183
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
184
 
 
 
 
 
 
 
 
 
 
185
  // (Almost) "true" 3-bit quantization.
186
  // Due to the need to use blocks as per ggml design, it ends up using
187
  // 3.0625 bpw because of the 16-bit scale for each block of 256.
@@ -242,6 +251,7 @@ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGM
242
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
243
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
244
  void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
 
245
 
246
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
247
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -259,6 +269,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
259
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
260
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
261
  void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
262
 
263
  // Dequantization
264
  void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -276,6 +287,7 @@ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRI
276
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
277
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
278
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
279
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
280
  void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
281
  void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -295,6 +307,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
295
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
296
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
297
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
298
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
299
  void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
300
  void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -305,6 +318,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
305
  //
306
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
307
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
308
  size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
309
  size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
310
  size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
182
  } block_iq2_xs;
183
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
184
 
185
+ // 2.5625 bpw quants
186
+ typedef struct {
187
+ ggml_fp16_t d;
188
+ uint8_t qs[QK_K/4];
189
+ uint8_t qh[QK_K/32];
190
+ uint8_t scales[QK_K/32];
191
+ } block_iq2_s;
192
+ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
193
+
194
  // (Almost) "true" 3-bit quantization.
195
  // Due to the need to use blocks as per ggml design, it ends up using
196
  // 3.0625 bpw because of the 16-bit scale for each block of 256.
 
251
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
252
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
253
  void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
254
+ void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
255
 
256
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
257
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
269
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
270
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
271
  void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
272
+ void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
273
 
274
  // Dequantization
275
  void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
287
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
288
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
289
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
290
+ void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
291
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
292
  void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
293
  void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
307
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
308
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
309
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
310
+ void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
311
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
312
  void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
313
  void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
318
  //
319
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
320
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
321
+ size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
322
  size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
323
  size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
324
  size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
ggml.c CHANGED
@@ -694,6 +694,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
694
  .vec_dot_type = GGML_TYPE_Q8_K,
695
  .nrows = 1,
696
  },
 
 
 
 
 
 
 
 
 
 
 
 
697
  [GGML_TYPE_IQ1_S] = {
698
  .type_name = "iq1_s",
699
  .blck_size = QK_K,
@@ -2327,6 +2339,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2327
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2328
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2329
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
 
2330
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2331
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2332
  }
@@ -7764,6 +7777,7 @@ static void ggml_compute_forward_add(
7764
  case GGML_TYPE_IQ1_S:
7765
  case GGML_TYPE_IQ4_NL:
7766
  case GGML_TYPE_IQ3_S:
 
7767
  {
7768
  ggml_compute_forward_add_q_f32(params, dst);
7769
  } break;
@@ -8044,6 +8058,7 @@ static void ggml_compute_forward_add1(
8044
  case GGML_TYPE_IQ1_S:
8045
  case GGML_TYPE_IQ4_NL:
8046
  case GGML_TYPE_IQ3_S:
 
8047
  {
8048
  ggml_compute_forward_add1_q_f32(params, dst);
8049
  } break;
@@ -8169,6 +8184,7 @@ static void ggml_compute_forward_acc(
8169
  case GGML_TYPE_IQ1_S:
8170
  case GGML_TYPE_IQ4_NL:
8171
  case GGML_TYPE_IQ3_S:
 
8172
  default:
8173
  {
8174
  GGML_ASSERT(false);
@@ -11068,6 +11084,7 @@ static void ggml_compute_forward_out_prod(
11068
  case GGML_TYPE_IQ1_S:
11069
  case GGML_TYPE_IQ4_NL:
11070
  case GGML_TYPE_IQ3_S:
 
11071
  {
11072
  ggml_compute_forward_out_prod_q_f32(params, dst);
11073
  } break;
@@ -11257,6 +11274,7 @@ static void ggml_compute_forward_set(
11257
  case GGML_TYPE_IQ1_S:
11258
  case GGML_TYPE_IQ4_NL:
11259
  case GGML_TYPE_IQ3_S:
 
11260
  default:
11261
  {
11262
  GGML_ASSERT(false);
@@ -11460,6 +11478,7 @@ static void ggml_compute_forward_get_rows(
11460
  case GGML_TYPE_IQ1_S:
11461
  case GGML_TYPE_IQ4_NL:
11462
  case GGML_TYPE_IQ3_S:
 
11463
  {
11464
  ggml_compute_forward_get_rows_q(params, dst);
11465
  } break;
@@ -12161,6 +12180,7 @@ static void ggml_compute_forward_alibi(
12161
  case GGML_TYPE_IQ1_S:
12162
  case GGML_TYPE_IQ4_NL:
12163
  case GGML_TYPE_IQ3_S:
 
12164
  case GGML_TYPE_Q8_K:
12165
  case GGML_TYPE_I8:
12166
  case GGML_TYPE_I16:
@@ -12245,6 +12265,7 @@ static void ggml_compute_forward_clamp(
12245
  case GGML_TYPE_IQ1_S:
12246
  case GGML_TYPE_IQ4_NL:
12247
  case GGML_TYPE_IQ3_S:
 
12248
  case GGML_TYPE_Q8_K:
12249
  case GGML_TYPE_I8:
12250
  case GGML_TYPE_I16:
@@ -19500,6 +19521,7 @@ void ggml_quantize_init(enum ggml_type type) {
19500
  switch (type) {
19501
  case GGML_TYPE_IQ2_XXS:
19502
  case GGML_TYPE_IQ2_XS:
 
19503
  case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19504
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19505
  case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
@@ -19786,6 +19808,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19786
  result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19787
  GGML_ASSERT(result == row_size * nrows);
19788
  } break;
 
 
 
 
 
 
 
 
 
19789
  case GGML_TYPE_IQ1_S:
19790
  {
19791
  GGML_ASSERT(start % QK_K == 0);
 
694
  .vec_dot_type = GGML_TYPE_Q8_K,
695
  .nrows = 1,
696
  },
697
+ [GGML_TYPE_IQ2_S] = {
698
+ .type_name = "iq2_s",
699
+ .blck_size = QK_K,
700
+ .type_size = sizeof(block_iq2_s),
701
+ .is_quantized = true,
702
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
703
+ .from_float = quantize_row_iq2_s,
704
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
705
+ .vec_dot = ggml_vec_dot_iq2_s_q8_K,
706
+ .vec_dot_type = GGML_TYPE_Q8_K,
707
+ .nrows = 1,
708
+ },
709
  [GGML_TYPE_IQ1_S] = {
710
  .type_name = "iq1_s",
711
  .blck_size = QK_K,
 
2339
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2340
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2341
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
2342
+ case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
2343
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2344
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2345
  }
 
7777
  case GGML_TYPE_IQ1_S:
7778
  case GGML_TYPE_IQ4_NL:
7779
  case GGML_TYPE_IQ3_S:
7780
+ case GGML_TYPE_IQ2_S:
7781
  {
7782
  ggml_compute_forward_add_q_f32(params, dst);
7783
  } break;
 
8058
  case GGML_TYPE_IQ1_S:
8059
  case GGML_TYPE_IQ4_NL:
8060
  case GGML_TYPE_IQ3_S:
8061
+ case GGML_TYPE_IQ2_S:
8062
  {
8063
  ggml_compute_forward_add1_q_f32(params, dst);
8064
  } break;
 
8184
  case GGML_TYPE_IQ1_S:
8185
  case GGML_TYPE_IQ4_NL:
8186
  case GGML_TYPE_IQ3_S:
8187
+ case GGML_TYPE_IQ2_S:
8188
  default:
8189
  {
8190
  GGML_ASSERT(false);
 
11084
  case GGML_TYPE_IQ1_S:
11085
  case GGML_TYPE_IQ4_NL:
11086
  case GGML_TYPE_IQ3_S:
11087
+ case GGML_TYPE_IQ2_S:
11088
  {
11089
  ggml_compute_forward_out_prod_q_f32(params, dst);
11090
  } break;
 
11274
  case GGML_TYPE_IQ1_S:
11275
  case GGML_TYPE_IQ4_NL:
11276
  case GGML_TYPE_IQ3_S:
11277
+ case GGML_TYPE_IQ2_S:
11278
  default:
11279
  {
11280
  GGML_ASSERT(false);
 
11478
  case GGML_TYPE_IQ1_S:
11479
  case GGML_TYPE_IQ4_NL:
11480
  case GGML_TYPE_IQ3_S:
11481
+ case GGML_TYPE_IQ2_S:
11482
  {
11483
  ggml_compute_forward_get_rows_q(params, dst);
11484
  } break;
 
12180
  case GGML_TYPE_IQ1_S:
12181
  case GGML_TYPE_IQ4_NL:
12182
  case GGML_TYPE_IQ3_S:
12183
+ case GGML_TYPE_IQ2_S:
12184
  case GGML_TYPE_Q8_K:
12185
  case GGML_TYPE_I8:
12186
  case GGML_TYPE_I16:
 
12265
  case GGML_TYPE_IQ1_S:
12266
  case GGML_TYPE_IQ4_NL:
12267
  case GGML_TYPE_IQ3_S:
12268
+ case GGML_TYPE_IQ2_S:
12269
  case GGML_TYPE_Q8_K:
12270
  case GGML_TYPE_I8:
12271
  case GGML_TYPE_I16:
 
19521
  switch (type) {
19522
  case GGML_TYPE_IQ2_XXS:
19523
  case GGML_TYPE_IQ2_XS:
19524
+ case GGML_TYPE_IQ2_S:
19525
  case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19526
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19527
  case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
 
19808
  result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19809
  GGML_ASSERT(result == row_size * nrows);
19810
  } break;
19811
+ case GGML_TYPE_IQ2_S:
19812
+ {
19813
+ GGML_ASSERT(start % QK_K == 0);
19814
+ GGML_ASSERT(start % n_per_row == 0);
19815
+ size_t start_row = start / n_per_row;
19816
+ size_t row_size = ggml_row_size(type, n_per_row);
19817
+ result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19818
+ GGML_ASSERT(result == row_size * nrows);
19819
+ } break;
19820
  case GGML_TYPE_IQ1_S:
19821
  {
19822
  GGML_ASSERT(start % QK_K == 0);
ggml.h CHANGED
@@ -351,6 +351,7 @@ extern "C" {
351
  GGML_TYPE_IQ1_S = 19,
352
  GGML_TYPE_IQ4_NL = 20,
353
  GGML_TYPE_IQ3_S = 21,
 
354
  GGML_TYPE_I8,
355
  GGML_TYPE_I16,
356
  GGML_TYPE_I32,
@@ -391,6 +392,7 @@ extern "C" {
391
  GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
392
  GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
393
  GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
 
394
  };
395
 
396
  // available tensor operations:
 
351
  GGML_TYPE_IQ1_S = 19,
352
  GGML_TYPE_IQ4_NL = 20,
353
  GGML_TYPE_IQ3_S = 21,
354
+ GGML_TYPE_IQ2_S = 22,
355
  GGML_TYPE_I8,
356
  GGML_TYPE_I16,
357
  GGML_TYPE_I32,
 
392
  GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
393
  GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
394
  GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
395
+ GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
396
  };
397
 
398
  // available tensor operations: