Spaces:
Running
Running
Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (llama/5721)
Browse files
* Adding IQ2_S and IQ2_M as a single cumulative commit
* Update examples/quantize/quantize.cpp
Co-authored-by: Georgi Gerganov <[email protected]>
---------
Co-authored-by: Iwan Kawrakow <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml-cuda.cu +357 -1
- ggml-metal.m +31 -6
- ggml-metal.metal +487 -0
- ggml-quants.c +767 -8
- ggml-quants.h +14 -0
- ggml.c +31 -0
- ggml.h +2 -0
ggml-cuda.cu
CHANGED
|
@@ -523,6 +523,17 @@ typedef struct {
|
|
| 523 |
} block_iq2_xs;
|
| 524 |
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
#define QR3_XXS 8
|
| 527 |
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
| 528 |
typedef struct {
|
|
@@ -1689,6 +1700,265 @@ static const __device__ uint64_t iq2xs_grid[512] = {
|
|
| 1689 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 1690 |
};
|
| 1691 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1692 |
static const __device__ uint32_t iq3xxs_grid[256] = {
|
| 1693 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 1694 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -2037,6 +2307,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
|
| 2037 |
|
| 2038 |
}
|
| 2039 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2040 |
template<typename dst_t>
|
| 2041 |
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 2042 |
|
|
@@ -4800,6 +5091,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|
| 4800 |
#endif
|
| 4801 |
}
|
| 4802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4803 |
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
| 4804 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
| 4805 |
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
@@ -6996,6 +7335,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
|
|
| 6996 |
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
| 6997 |
}
|
| 6998 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6999 |
template<typename dst_t>
|
| 7000 |
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 7001 |
const int nb = k / QK_K;
|
|
@@ -7057,6 +7402,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
| 7057 |
return dequantize_row_iq2_xxs_cuda;
|
| 7058 |
case GGML_TYPE_IQ2_XS:
|
| 7059 |
return dequantize_row_iq2_xs_cuda;
|
|
|
|
|
|
|
| 7060 |
case GGML_TYPE_IQ3_XXS:
|
| 7061 |
return dequantize_row_iq3_xxs_cuda;
|
| 7062 |
case GGML_TYPE_IQ1_S:
|
|
@@ -7098,6 +7445,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
| 7098 |
return dequantize_row_iq2_xxs_cuda;
|
| 7099 |
case GGML_TYPE_IQ2_XS:
|
| 7100 |
return dequantize_row_iq2_xs_cuda;
|
|
|
|
|
|
|
| 7101 |
case GGML_TYPE_IQ3_XXS:
|
| 7102 |
return dequantize_row_iq3_xxs_cuda;
|
| 7103 |
case GGML_TYPE_IQ1_S:
|
|
@@ -8848,6 +9197,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
|
| 8848 |
case GGML_TYPE_Q6_K:
|
| 8849 |
case GGML_TYPE_IQ2_XXS:
|
| 8850 |
case GGML_TYPE_IQ2_XS:
|
|
|
|
| 8851 |
case GGML_TYPE_IQ3_XXS:
|
| 8852 |
case GGML_TYPE_IQ1_S:
|
| 8853 |
case GGML_TYPE_IQ4_NL:
|
|
@@ -8874,6 +9224,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
|
| 8874 |
case GGML_TYPE_Q5_K:
|
| 8875 |
case GGML_TYPE_IQ2_XXS:
|
| 8876 |
case GGML_TYPE_IQ2_XS:
|
|
|
|
| 8877 |
case GGML_TYPE_IQ3_XXS:
|
| 8878 |
case GGML_TYPE_IQ1_S:
|
| 8879 |
case GGML_TYPE_IQ4_NL:
|
|
@@ -8971,6 +9322,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
|
| 8971 |
mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
|
| 8972 |
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
| 8973 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8974 |
case GGML_TYPE_IQ3_XXS:
|
| 8975 |
mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
|
| 8976 |
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
|
@@ -11710,7 +12065,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 11710 |
}
|
| 11711 |
ggml_type a_type = a->type;
|
| 11712 |
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
| 11713 |
-
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S
|
|
|
|
| 11714 |
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
| 11715 |
return false;
|
| 11716 |
}
|
|
|
|
| 523 |
} block_iq2_xs;
|
| 524 |
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 525 |
|
| 526 |
+
// 2.5625 bpw quants
// IQ2_S block layout. The struct below occupies sizeof(half) + QK_K/4 + QK_K/16
// bytes (enforced by the static_assert); for QK_K == 256 that is 82 bytes per
// 256 weights, i.e. the 2.5625 bits per weight quoted above.
|
| 527 |
+
// NOTE(review): QR2_S / QI2_S follow the QRx / QIx naming used by the other
// quant types in this file; their consumers are not visible in this excerpt —
// presumably the mul_mat_vec_q_cuda template arguments, confirm against callers.
#define QR2_S 8
|
| 528 |
+
#define QI2_S (QK_K / (4*QR2_S))
|
| 529 |
+
typedef struct {
|
| 530 |
+
// Per-block scale, stored as half; dequantize_block_iq2_s converts it to float.
half d;
|
| 531 |
+
// Two halves, as read by dequantize_block_iq2_s:
//   qs[0 .. QK_K/8)        low 8 bits of the iq2s_grid index, one byte per 8 weights
//   qs[QK_K/8 .. QK_K/4)   sign byte for the same 8 weights (tested via kmask_iq2xs)
uint8_t qs[QK_K/4];
|
| 532 |
+
// High 2 bits of the grid index: each byte carries four 2-bit fields, one per
// group of 8 weights inside a 32-weight sub-block.
uint8_t qh[QK_K/32];
|
| 533 |
+
// Two 4-bit sub-scales per byte: low nibble for the first 16 weights of a
// 32-weight sub-block, high nibble for the last 16.
uint8_t scales[QK_K/32];
|
| 534 |
+
} block_iq2_s;
|
| 535 |
+
// Guards against compiler padding: QK_K/16 == QK_K/32 (qh) + QK_K/32 (scales).
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
| 536 |
+
|
| 537 |
#define QR3_XXS 8
|
| 538 |
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
| 539 |
typedef struct {
|
|
|
|
| 1700 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 1701 |
};
|
| 1702 |
|
| 1703 |
+
static const __device__ uint64_t iq2s_grid[1024] = {
|
| 1704 |
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
| 1705 |
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
| 1706 |
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
| 1707 |
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
| 1708 |
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
| 1709 |
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
| 1710 |
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
| 1711 |
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
| 1712 |
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
| 1713 |
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
| 1714 |
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
| 1715 |
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
| 1716 |
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
| 1717 |
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
| 1718 |
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
| 1719 |
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
| 1720 |
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
| 1721 |
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
| 1722 |
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
| 1723 |
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
| 1724 |
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
| 1725 |
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
| 1726 |
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
| 1727 |
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
| 1728 |
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
| 1729 |
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
| 1730 |
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
| 1731 |
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
| 1732 |
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
| 1733 |
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
| 1734 |
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
| 1735 |
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
| 1736 |
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
| 1737 |
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
| 1738 |
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
| 1739 |
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
| 1740 |
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
| 1741 |
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
| 1742 |
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
| 1743 |
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
| 1744 |
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
| 1745 |
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
| 1746 |
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
| 1747 |
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
| 1748 |
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
| 1749 |
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
| 1750 |
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
| 1751 |
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
| 1752 |
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
| 1753 |
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
| 1754 |
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
| 1755 |
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
| 1756 |
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
| 1757 |
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
| 1758 |
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
| 1759 |
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
| 1760 |
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
| 1761 |
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
| 1762 |
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
| 1763 |
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
| 1764 |
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
| 1765 |
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
| 1766 |
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
| 1767 |
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
| 1768 |
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
| 1769 |
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
| 1770 |
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
| 1771 |
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
| 1772 |
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
| 1773 |
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
| 1774 |
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
| 1775 |
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
| 1776 |
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
| 1777 |
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
| 1778 |
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
| 1779 |
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
| 1780 |
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
| 1781 |
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
| 1782 |
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
| 1783 |
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
| 1784 |
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
| 1785 |
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
| 1786 |
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
| 1787 |
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
| 1788 |
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
| 1789 |
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
| 1790 |
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
| 1791 |
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
| 1792 |
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
| 1793 |
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
| 1794 |
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
| 1795 |
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
| 1796 |
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
| 1797 |
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
| 1798 |
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
| 1799 |
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
| 1800 |
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
| 1801 |
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
| 1802 |
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
| 1803 |
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
| 1804 |
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
| 1805 |
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
| 1806 |
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
| 1807 |
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
| 1808 |
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
| 1809 |
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
| 1810 |
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
| 1811 |
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
| 1812 |
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
| 1813 |
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
| 1814 |
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
| 1815 |
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
| 1816 |
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
| 1817 |
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
| 1818 |
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
| 1819 |
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
| 1820 |
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
| 1821 |
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
| 1822 |
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
| 1823 |
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
| 1824 |
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
| 1825 |
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
| 1826 |
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
| 1827 |
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
| 1828 |
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
| 1829 |
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
| 1830 |
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
| 1831 |
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
| 1832 |
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
| 1833 |
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
| 1834 |
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
| 1835 |
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
| 1836 |
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
| 1837 |
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
| 1838 |
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
| 1839 |
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
| 1840 |
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
| 1841 |
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
| 1842 |
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
| 1843 |
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
| 1844 |
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
| 1845 |
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
| 1846 |
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
| 1847 |
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
| 1848 |
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
| 1849 |
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
| 1850 |
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
| 1851 |
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
| 1852 |
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
| 1853 |
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
| 1854 |
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
| 1855 |
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
| 1856 |
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
| 1857 |
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
| 1858 |
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
| 1859 |
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
| 1860 |
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
| 1861 |
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
| 1862 |
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
| 1863 |
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
| 1864 |
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
| 1865 |
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
| 1866 |
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
| 1867 |
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
| 1868 |
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
| 1869 |
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
| 1870 |
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
| 1871 |
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
| 1872 |
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
| 1873 |
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
| 1874 |
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
| 1875 |
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
| 1876 |
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
| 1877 |
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
| 1878 |
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
| 1879 |
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
| 1880 |
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
| 1881 |
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
| 1882 |
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
| 1883 |
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
| 1884 |
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
| 1885 |
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
| 1886 |
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
| 1887 |
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
| 1888 |
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
| 1889 |
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
| 1890 |
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
| 1891 |
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
| 1892 |
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
| 1893 |
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
| 1894 |
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
| 1895 |
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
| 1896 |
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
| 1897 |
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
| 1898 |
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
| 1899 |
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
| 1900 |
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
| 1901 |
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
| 1902 |
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
| 1903 |
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
| 1904 |
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
| 1905 |
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
| 1906 |
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
| 1907 |
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
| 1908 |
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
| 1909 |
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
| 1910 |
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
| 1911 |
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
| 1912 |
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
| 1913 |
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
| 1914 |
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
| 1915 |
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
| 1916 |
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
| 1917 |
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
| 1918 |
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
| 1919 |
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
| 1920 |
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
| 1921 |
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
| 1922 |
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
| 1923 |
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
| 1924 |
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
| 1925 |
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
| 1926 |
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
| 1927 |
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
| 1928 |
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
| 1929 |
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
| 1930 |
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
| 1931 |
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
| 1932 |
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
| 1933 |
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
| 1934 |
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
| 1935 |
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
| 1936 |
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
| 1937 |
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
| 1938 |
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
| 1939 |
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
| 1940 |
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
| 1941 |
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
| 1942 |
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
| 1943 |
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
| 1944 |
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
| 1945 |
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
| 1946 |
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
| 1947 |
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
| 1948 |
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
| 1949 |
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
| 1950 |
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
| 1951 |
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
| 1952 |
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
| 1953 |
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
| 1954 |
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
| 1955 |
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
| 1956 |
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
| 1957 |
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
| 1958 |
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
| 1959 |
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
| 1960 |
+
};
|
| 1961 |
+
|
| 1962 |
static const __device__ uint32_t iq3xxs_grid[256] = {
|
| 1963 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 1964 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
|
|
| 2307 |
|
| 2308 |
}
|
| 2309 |
|
| 2310 |
+
// Dequantizes IQ2_S blocks into dst_t values.
//
// vx : array of block_iq2_s; CUDA block blockIdx.x handles element x[blockIdx.x].
// yy : output buffer; each CUDA block writes QK_K consecutive values starting
//      at yy + blockIdx.x*QK_K.
//
// Each thread produces 8 outputs. The il/ib ranges noted below (0...3, 0...7)
// assume 32 threads per block — the launch site for this kernel is not visible
// in this excerpt; TODO confirm it matches the <<<nb, 32>>> used for iq2_xs.
// Only implemented for QK_K == 256; any other value hits assert(false).
template<typename dst_t>
|
| 2311 |
+
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 2312 |
+
|
| 2313 |
+
// One quantized block per CUDA block.
const int i = blockIdx.x;
|
| 2314 |
+
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 2315 |
+
|
| 2316 |
+
const int tid = threadIdx.x;
|
| 2317 |
+
#if QK_K == 256
|
| 2318 |
+
// il selects the group of 8 weights inside a 32-weight sub-block.
const int il = tid/8; // 0...3
|
| 2319 |
+
// ib selects the 32-weight sub-block inside the QK_K block.
const int ib = tid%8; // 0...7
|
| 2320 |
+
// Destination of this thread's 8 outputs.
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 2321 |
+
// 10-bit index into the 1024-entry iq2s_grid: bits 0-7 come from qs, bits 8-9
// are the il-th 2-bit field of qh[ib], moved into place by the shift and
// isolated by the 0x300 mask. The selected uint64_t entry is read as 8 bytes.
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
| 2322 |
+
// Effective scale = d * (0.5 + 4-bit sub-scale) * 0.25. '*' binds tighter than
// '>>', so the shift amount is 4*(il/2): low nibble for il 0-1, high for il 2-3.
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 2323 |
+
// Sign byte for these 8 weights lives in the second half of qs.
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
| 2324 |
+
// '&' binds tighter than '?:': element j is negated when signs has the bit(s)
// of kmask_iq2xs[j] set (kmask_iq2xs is defined elsewhere in the file).
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 2325 |
+
#else
|
| 2326 |
+
// No implementation for other super-block sizes.
assert(false);
|
| 2327 |
+
#endif
|
| 2328 |
+
|
| 2329 |
+
}
|
| 2330 |
+
|
| 2331 |
template<typename dst_t>
|
| 2332 |
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 2333 |
|
|
|
|
| 5091 |
#endif
|
| 5092 |
}
|
| 5093 |
|
| 5094 |
+
// TODO
|
| 5095 |
+
// Partial dot product between one IQ2_S block (vbq) and the q8_1 block selected by iqs.
//
// iqs is used directly as the index of the q8_1 block in bq8_1 and as the index of
// the matching scales/qh bytes and 4-byte qs group inside the IQ2_S block.
// The first two grid entries are weighted by the low nibble of the scale byte,
// the last two by the high nibble.
//
// Requires __CUDA_ARCH__ >= MIN_CC_DP4A and QK_K == 256; otherwise the function
// asserts and returns 0.
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A && QK_K == 256 // integer intrinsics needed below
    const block_iq2_s * blk = (const block_iq2_s *) vbq;

    const int sub = iqs;

    // q8 values are consumed as packed 32-bit words, two words per grid entry.
    const int     * q8_words   = (const int *) bq8_1[sub].qs;
    const uint8_t * sign_bytes = blk->qs + QK_K/8 + 4*sub;

    const uint8_t scale_lo = blk->scales[sub] & 0xf;
    const uint8_t scale_hi = blk->scales[sub] >> 4;

    // partial[0] accumulates entries 0-1, partial[1] accumulates entries 2-3.
    int partial[2] = {0, 0};

    for (int l = 0; l < 4; ++l) {
        // Grid index: low 8 bits from qs, bits 8-9 picked out of qh[sub].
        const int grid_idx = blk->qs[4*sub + l] | ((blk->qh[sub] << (8 - 2*l)) & 0x300);
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + grid_idx);

        // Expand each sign bit into a full-byte mask (0xff where the bit is set).
        const uint32_t mask_lo = __vcmpeq4(((sign_bytes[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t mask_hi = __vcmpeq4(((sign_bytes[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);

        // (v ^ m) - m negates the bytes selected by the mask and leaves the rest unchanged.
        const int signed_lo = __vsub4(grid[0] ^ mask_lo, mask_lo);
        const int signed_hi = __vsub4(grid[1] ^ mask_hi, mask_hi);

        int acc = partial[l/2];
        acc = __dp4a(signed_lo, q8_words[2*l + 0], acc);
        acc = __dp4a(signed_hi, q8_words[2*l + 1], acc);
        partial[l/2] = acc;
    }

    const float d = (float)blk->d * __low2float(bq8_1[sub].ds) * 0.25f;
    return d * ((0.5f + scale_lo) * partial[0] + (0.5f + scale_hi) * partial[1]);
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
}
|
| 5141 |
+
|
| 5142 |
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
| 5143 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
| 5144 |
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
|
|
| 7335 |
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
| 7336 |
}
|
| 7337 |
|
| 7338 |
+
// Host-side launcher for dequantize_block_iq2_s.
//   vx     - quantized input, passed through to the kernel
//   y      - output buffer, passed through to the kernel
//   k      - number of values; the grid has k / QK_K blocks (integer division)
//   stream - CUDA stream the kernel is enqueued on
template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const dim3 grid_dims(k / QK_K, 1, 1);
    const dim3 block_dims(32, 1, 1); // 32 threads per block
    dequantize_block_iq2_s<<<grid_dims, block_dims, 0, stream>>>(vx, y);
}
|
| 7343 |
+
|
| 7344 |
template<typename dst_t>
|
| 7345 |
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 7346 |
const int nb = k / QK_K;
|
|
|
|
| 7402 |
return dequantize_row_iq2_xxs_cuda;
|
| 7403 |
case GGML_TYPE_IQ2_XS:
|
| 7404 |
return dequantize_row_iq2_xs_cuda;
|
| 7405 |
+
case GGML_TYPE_IQ2_S:
|
| 7406 |
+
return dequantize_row_iq2_s_cuda;
|
| 7407 |
case GGML_TYPE_IQ3_XXS:
|
| 7408 |
return dequantize_row_iq3_xxs_cuda;
|
| 7409 |
case GGML_TYPE_IQ1_S:
|
|
|
|
| 7445 |
return dequantize_row_iq2_xxs_cuda;
|
| 7446 |
case GGML_TYPE_IQ2_XS:
|
| 7447 |
return dequantize_row_iq2_xs_cuda;
|
| 7448 |
+
case GGML_TYPE_IQ2_S:
|
| 7449 |
+
return dequantize_row_iq2_s_cuda;
|
| 7450 |
case GGML_TYPE_IQ3_XXS:
|
| 7451 |
return dequantize_row_iq3_xxs_cuda;
|
| 7452 |
case GGML_TYPE_IQ1_S:
|
|
|
|
| 9197 |
case GGML_TYPE_Q6_K:
|
| 9198 |
case GGML_TYPE_IQ2_XXS:
|
| 9199 |
case GGML_TYPE_IQ2_XS:
|
| 9200 |
+
case GGML_TYPE_IQ2_S:
|
| 9201 |
case GGML_TYPE_IQ3_XXS:
|
| 9202 |
case GGML_TYPE_IQ1_S:
|
| 9203 |
case GGML_TYPE_IQ4_NL:
|
|
|
|
| 9224 |
case GGML_TYPE_Q5_K:
|
| 9225 |
case GGML_TYPE_IQ2_XXS:
|
| 9226 |
case GGML_TYPE_IQ2_XS:
|
| 9227 |
+
case GGML_TYPE_IQ2_S:
|
| 9228 |
case GGML_TYPE_IQ3_XXS:
|
| 9229 |
case GGML_TYPE_IQ1_S:
|
| 9230 |
case GGML_TYPE_IQ4_NL:
|
|
|
|
| 9322 |
mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
|
| 9323 |
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
| 9324 |
break;
|
| 9325 |
+
case GGML_TYPE_IQ2_S:
|
| 9326 |
+
mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
|
| 9327 |
+
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
| 9328 |
+
break;
|
| 9329 |
case GGML_TYPE_IQ3_XXS:
|
| 9330 |
mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
|
| 9331 |
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
|
|
|
| 12065 |
}
|
| 12066 |
ggml_type a_type = a->type;
|
| 12067 |
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
| 12068 |
+
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
|
| 12069 |
+
a_type == GGML_TYPE_IQ2_S) {
|
| 12070 |
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
| 12071 |
return false;
|
| 12072 |
}
|
ggml-metal.m
CHANGED
|
@@ -62,6 +62,7 @@ enum ggml_metal_kernel_type {
|
|
| 62 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
| 63 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
| 64 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
|
|
|
|
| 65 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
| 66 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
| 67 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
|
@@ -87,6 +88,7 @@ enum ggml_metal_kernel_type {
|
|
| 87 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
| 88 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
| 89 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
|
|
|
|
| 90 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
| 91 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
| 92 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
|
@@ -108,6 +110,7 @@ enum ggml_metal_kernel_type {
|
|
| 108 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
| 109 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
| 110 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
|
|
|
|
| 111 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
| 112 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
| 113 |
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
|
@@ -126,6 +129,7 @@ enum ggml_metal_kernel_type {
|
|
| 126 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
| 127 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
| 128 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
|
|
|
|
| 129 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
| 130 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
| 131 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
|
@@ -144,6 +148,7 @@ enum ggml_metal_kernel_type {
|
|
| 144 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
| 145 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
| 146 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
|
|
|
| 147 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
| 148 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
| 149 |
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
|
@@ -458,6 +463,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 458 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
| 459 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
| 460 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
|
|
|
|
| 461 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
| 462 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
| 463 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
|
@@ -483,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 483 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
| 484 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
| 485 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
|
|
|
|
| 486 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
| 487 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
| 488 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
|
@@ -504,6 +511,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 504 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
| 505 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
| 506 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
|
|
|
|
| 507 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
| 508 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
| 509 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
|
@@ -522,6 +530,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 522 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
| 523 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
| 524 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
|
|
|
|
| 525 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
| 526 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
| 527 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
|
@@ -540,6 +549,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 540 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
| 541 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
| 542 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
|
|
|
|
| 543 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
| 544 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
| 545 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
|
@@ -1358,6 +1368,7 @@ static bool ggml_metal_graph_compute(
|
|
| 1358 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
| 1359 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
| 1360 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
|
|
|
| 1361 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
| 1362 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
| 1363 |
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
|
@@ -1500,6 +1511,12 @@ static bool ggml_metal_graph_compute(
|
|
| 1500 |
nth1 = 16;
|
| 1501 |
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
| 1502 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1503 |
case GGML_TYPE_IQ1_S:
|
| 1504 |
{
|
| 1505 |
nth0 = 4;
|
|
@@ -1544,9 +1561,9 @@ static bool ggml_metal_graph_compute(
|
|
| 1544 |
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
|
| 1545 |
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
|
| 1546 |
|
| 1547 |
-
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1
|
| 1548 |
-
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1
|
| 1549 |
-
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S
|
| 1550 |
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
| 1551 |
}
|
| 1552 |
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
|
@@ -1658,6 +1675,7 @@ static bool ggml_metal_graph_compute(
|
|
| 1658 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
| 1659 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
| 1660 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
|
|
|
| 1661 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
| 1662 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
| 1663 |
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
|
@@ -1803,6 +1821,12 @@ static bool ggml_metal_graph_compute(
|
|
| 1803 |
nth1 = 16;
|
| 1804 |
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
|
| 1805 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1806 |
case GGML_TYPE_IQ1_S:
|
| 1807 |
{
|
| 1808 |
nth0 = 4;
|
|
@@ -1863,9 +1887,9 @@ static bool ggml_metal_graph_compute(
|
|
| 1863 |
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
| 1864 |
}
|
| 1865 |
|
| 1866 |
-
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1
|
| 1867 |
-
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1
|
| 1868 |
-
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S
|
| 1869 |
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
| 1870 |
}
|
| 1871 |
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
|
@@ -1925,6 +1949,7 @@ static bool ggml_metal_graph_compute(
|
|
| 1925 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
| 1926 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
| 1927 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
|
|
|
|
| 1928 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
| 1929 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
| 1930 |
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
|
|
|
| 62 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
| 63 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
| 64 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
|
| 65 |
+
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
|
| 66 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
| 67 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
| 68 |
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
|
|
|
| 88 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
| 89 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
| 90 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
|
| 91 |
+
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
|
| 92 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
| 93 |
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
| 94 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
|
|
|
| 110 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
| 111 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
| 112 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
|
| 113 |
+
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
|
| 114 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
| 115 |
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
| 116 |
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
|
|
|
| 129 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
| 130 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
| 131 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
|
| 132 |
+
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
|
| 133 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
| 134 |
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
| 135 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
|
|
|
| 148 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
| 149 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
| 150 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
| 151 |
+
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
|
| 152 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
| 153 |
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
| 154 |
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
|
|
|
| 463 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
| 464 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
| 465 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
|
| 466 |
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true);
|
| 467 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
| 468 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
| 469 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
|
|
|
| 489 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
| 490 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
| 491 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
|
| 492 |
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction);
|
| 493 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
| 494 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
| 495 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
|
|
|
| 511 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
| 512 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
| 513 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
|
| 514 |
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction);
|
| 515 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
| 516 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
| 517 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
|
|
|
| 530 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
| 531 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
| 532 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
|
| 533 |
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm);
|
| 534 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
| 535 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
| 536 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
|
|
|
| 549 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
| 550 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
| 551 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
|
| 552 |
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm);
|
| 553 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
| 554 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
| 555 |
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
|
|
|
| 1368 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
| 1369 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
| 1370 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
| 1371 |
+
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
|
| 1372 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
| 1373 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
| 1374 |
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
|
|
|
| 1511 |
nth1 = 16;
|
| 1512 |
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
| 1513 |
} break;
|
| 1514 |
+
case GGML_TYPE_IQ2_S:
|
| 1515 |
+
{
|
| 1516 |
+
nth0 = 4;
|
| 1517 |
+
nth1 = 16;
|
| 1518 |
+
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
|
| 1519 |
+
} break;
|
| 1520 |
case GGML_TYPE_IQ1_S:
|
| 1521 |
{
|
| 1522 |
nth0 = 4;
|
|
|
|
| 1561 |
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
|
| 1562 |
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
|
| 1563 |
|
| 1564 |
+
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
| 1565 |
+
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
| 1566 |
+
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
|
| 1567 |
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
| 1568 |
}
|
| 1569 |
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
|
|
|
| 1675 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
| 1676 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
| 1677 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
| 1678 |
+
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
|
| 1679 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
| 1680 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
| 1681 |
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
|
|
|
| 1821 |
nth1 = 16;
|
| 1822 |
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
|
| 1823 |
} break;
|
| 1824 |
+
case GGML_TYPE_IQ2_S:
|
| 1825 |
+
{
|
| 1826 |
+
nth0 = 4;
|
| 1827 |
+
nth1 = 16;
|
| 1828 |
+
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
|
| 1829 |
+
} break;
|
| 1830 |
case GGML_TYPE_IQ1_S:
|
| 1831 |
{
|
| 1832 |
nth0 = 4;
|
|
|
|
| 1887 |
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
| 1888 |
}
|
| 1889 |
|
| 1890 |
+
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
| 1891 |
+
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
| 1892 |
+
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
|
| 1893 |
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
| 1894 |
}
|
| 1895 |
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
|
|
|
| 1949 |
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
| 1950 |
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
| 1951 |
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
|
| 1952 |
+
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break;
|
| 1953 |
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
| 1954 |
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
| 1955 |
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
ggml-metal.metal
CHANGED
|
@@ -2519,6 +2519,14 @@ typedef struct {
|
|
| 2519 |
} block_iq2_xs;
|
| 2520 |
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
| 2521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2522 |
typedef struct {
|
| 2523 |
half d;
|
| 2524 |
uint8_t qs[3*QK_K/8];
|
|
@@ -3774,6 +3782,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
|
| 3774 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 3775 |
};
|
| 3776 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3777 |
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
| 3778 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 3779 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -4572,6 +4839,139 @@ kernel void kernel_mul_mv_iq3_s_f32(
|
|
| 4572 |
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
| 4573 |
}
|
| 4574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4575 |
void kernel_mul_mv_iq1_s_f32_impl(
|
| 4576 |
device const void * src0,
|
| 4577 |
device const float * src1,
|
|
@@ -5188,6 +5588,25 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
|
|
| 5188 |
}
|
| 5189 |
}
|
| 5190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5191 |
template <typename type4x4>
|
| 5192 |
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
| 5193 |
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
@@ -5762,6 +6181,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
|
| 5762 |
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 5763 |
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 5764 |
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
|
|
| 5765 |
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 5766 |
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 5767 |
|
|
@@ -5804,6 +6224,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
|
| 5804 |
// Explicit instantiations of kernel_mul_mm: one per quantized block type, each paired
// with its dequantize_* routine and exported under the given host_name.
// NOTE(review): the middle template argument is QK_NL for the iq2/iq3/iq1 types and 2 for
// iq4_nl; its meaning is defined by kernel_mul_mm, which is not visible here.
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 5805 |
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 5806 |
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
|
|
| 5807 |
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 5808 |
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 5809 |
|
|
@@ -5858,6 +6279,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
|
| 5858 |
// Explicit instantiations of kernel_mul_mm_id: one per quantized block type, each paired
// with its dequantize_* routine and exported under the given host_name.
// NOTE(review): the middle template argument is QK_NL for the iq2/iq3/iq1 types and 2 for
// iq4_nl; its meaning is defined by kernel_mul_mm_id, which is not visible here.
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 5859 |
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 5860 |
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
|
|
| 5861 |
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 5862 |
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 5863 |
|
|
@@ -6893,6 +7315,71 @@ kernel void kernel_mul_mv_id_iq3_s_f32(
|
|
| 6893 |
sgitg);
|
| 6894 |
}
|
| 6895 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6896 |
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
| 6897 |
kernel void kernel_mul_mv_id_iq1_s_f32(
|
| 6898 |
device const char * ids,
|
|
|
|
| 2519 |
} block_iq2_xs;
|
| 2520 |
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
| 2521 |
|
| 2522 |
+
// 2.5625 bpw quants
// IQ2_S block: one block holds QK_K quantized values.
// For QK_K = 256 the fields add up to 2 + 64 + 8 + 8 = 82 bytes, i.e. 82*8/256 = 2.5625 bpw.
|
| 2523 |
+
typedef struct {
|
| 2524 |
+
// block-wide scale; the kernels multiply it by the per-sub-block scales below
half d;
|
| 2525 |
+
// first QK_K/8 bytes: low 8 bits of the grid indices (4 bytes per 32 values);
// last QK_K/8 bytes: sign bitmasks (the kernels read them at qs + QK_K/8)
uint8_t qs[QK_K/4];
|
| 2526 |
+
// one byte per 32 values: four 2-bit fields holding bits 8..9 of the four grid indices
uint8_t qh[QK_K/32];
|
| 2527 |
+
// one byte per 32 values: two 4-bit scales (low nibble = first 16 values, high nibble = last 16)
uint8_t scales[QK_K/32];
|
| 2528 |
+
} block_iq2_s;
|
| 2529 |
+
|
| 2530 |
typedef struct {
|
| 2531 |
half d;
|
| 2532 |
uint8_t qs[3*QK_K/8];
|
|
|
|
| 3782 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 3783 |
};
|
| 3784 |
|
| 3785 |
+
constexpr constant static uint64_t iq2s_grid[1024] = {
|
| 3786 |
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
| 3787 |
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
| 3788 |
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
| 3789 |
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
| 3790 |
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
| 3791 |
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
| 3792 |
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
| 3793 |
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
| 3794 |
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
| 3795 |
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
| 3796 |
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
| 3797 |
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
| 3798 |
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
| 3799 |
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
| 3800 |
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
| 3801 |
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
| 3802 |
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
| 3803 |
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
| 3804 |
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
| 3805 |
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
| 3806 |
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
| 3807 |
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
| 3808 |
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
| 3809 |
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
| 3810 |
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
| 3811 |
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
| 3812 |
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
| 3813 |
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
| 3814 |
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
| 3815 |
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
| 3816 |
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
| 3817 |
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
| 3818 |
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
| 3819 |
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
| 3820 |
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
| 3821 |
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
| 3822 |
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
| 3823 |
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
| 3824 |
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
| 3825 |
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
| 3826 |
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
| 3827 |
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
| 3828 |
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
| 3829 |
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
| 3830 |
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
| 3831 |
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
| 3832 |
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
| 3833 |
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
| 3834 |
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
| 3835 |
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
| 3836 |
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
| 3837 |
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
| 3838 |
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
| 3839 |
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
| 3840 |
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
| 3841 |
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
| 3842 |
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
| 3843 |
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
| 3844 |
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
| 3845 |
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
| 3846 |
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
| 3847 |
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
| 3848 |
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
| 3849 |
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
| 3850 |
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
| 3851 |
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
| 3852 |
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
| 3853 |
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
| 3854 |
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
| 3855 |
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
| 3856 |
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
| 3857 |
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
| 3858 |
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
| 3859 |
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
| 3860 |
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
| 3861 |
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
| 3862 |
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
| 3863 |
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
| 3864 |
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
| 3865 |
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
| 3866 |
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
| 3867 |
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
| 3868 |
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
| 3869 |
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
| 3870 |
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
| 3871 |
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
| 3872 |
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
| 3873 |
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
| 3874 |
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
| 3875 |
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
| 3876 |
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
| 3877 |
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
| 3878 |
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
| 3879 |
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
| 3880 |
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
| 3881 |
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
| 3882 |
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
| 3883 |
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
| 3884 |
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
| 3885 |
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
| 3886 |
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
| 3887 |
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
| 3888 |
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
| 3889 |
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
| 3890 |
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
| 3891 |
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
| 3892 |
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
| 3893 |
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
| 3894 |
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
| 3895 |
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
| 3896 |
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
| 3897 |
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
| 3898 |
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
| 3899 |
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
| 3900 |
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
| 3901 |
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
| 3902 |
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
| 3903 |
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
| 3904 |
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
| 3905 |
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
| 3906 |
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
| 3907 |
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
| 3908 |
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
| 3909 |
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
| 3910 |
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
| 3911 |
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
| 3912 |
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
| 3913 |
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
| 3914 |
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
| 3915 |
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
| 3916 |
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
| 3917 |
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
| 3918 |
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
| 3919 |
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
| 3920 |
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
| 3921 |
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
| 3922 |
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
| 3923 |
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
| 3924 |
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
| 3925 |
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
| 3926 |
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
| 3927 |
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
| 3928 |
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
| 3929 |
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
| 3930 |
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
| 3931 |
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
| 3932 |
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
| 3933 |
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
| 3934 |
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
| 3935 |
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
| 3936 |
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
| 3937 |
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
| 3938 |
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
| 3939 |
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
| 3940 |
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
| 3941 |
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
| 3942 |
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
| 3943 |
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
| 3944 |
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
| 3945 |
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
| 3946 |
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
| 3947 |
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
| 3948 |
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
| 3949 |
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
| 3950 |
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
| 3951 |
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
| 3952 |
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
| 3953 |
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
| 3954 |
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
| 3955 |
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
| 3956 |
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
| 3957 |
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
| 3958 |
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
| 3959 |
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
| 3960 |
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
| 3961 |
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
| 3962 |
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
| 3963 |
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
| 3964 |
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
| 3965 |
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
| 3966 |
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
| 3967 |
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
| 3968 |
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
| 3969 |
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
| 3970 |
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
| 3971 |
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
| 3972 |
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
| 3973 |
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
| 3974 |
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
| 3975 |
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
| 3976 |
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
| 3977 |
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
| 3978 |
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
| 3979 |
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
| 3980 |
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
| 3981 |
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
| 3982 |
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
| 3983 |
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
| 3984 |
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
| 3985 |
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
| 3986 |
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
| 3987 |
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
| 3988 |
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
| 3989 |
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
| 3990 |
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
| 3991 |
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
| 3992 |
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
| 3993 |
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
| 3994 |
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
| 3995 |
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
| 3996 |
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
| 3997 |
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
| 3998 |
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
| 3999 |
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
| 4000 |
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
| 4001 |
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
| 4002 |
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
| 4003 |
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
| 4004 |
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
| 4005 |
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
| 4006 |
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
| 4007 |
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
| 4008 |
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
| 4009 |
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
| 4010 |
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
| 4011 |
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
| 4012 |
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
| 4013 |
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
| 4014 |
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
| 4015 |
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
| 4016 |
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
| 4017 |
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
| 4018 |
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
| 4019 |
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
| 4020 |
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
| 4021 |
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
| 4022 |
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
| 4023 |
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
| 4024 |
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
| 4025 |
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
| 4026 |
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
| 4027 |
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
| 4028 |
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
| 4029 |
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
| 4030 |
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
| 4031 |
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
| 4032 |
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
| 4033 |
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
| 4034 |
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
| 4035 |
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
| 4036 |
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
| 4037 |
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
| 4038 |
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
| 4039 |
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
| 4040 |
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
| 4041 |
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
| 4042 |
+
};
|
| 4043 |
+
|
| 4044 |
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
| 4045 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 4046 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
|
|
| 4839 |
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
| 4840 |
}
|
| 4841 |
|
| 4842 |
+
// Matrix-vector product of an IQ2_S-quantized src0 (block_iq2_s) with float src1, written to dst.
//
// Work split, as visible in the body:
//   - each simdgroup accumulates N_DST consecutive rows starting at first_row;
//   - each lane (tiisg) handles one 32-value sub-block per outer-loop iteration and then
//     skips ahead by 32 sub-blocks;
//   - per-row partial sums are combined with simd_sum and lane 0 stores the result,
//     multiplied by 0.25f.
// NOTE(review): the stride of 32 in the outer loop assumes 32 lanes per simdgroup - confirm.
//
// Parameters used here:
//   src0            quantized matrix, reinterpreted as block_iq2_s
//   src1 / dst      float input vector(s) / float output
//   ne00            row length of src0 in values; nb = ne00/QK_K blocks per row
//   ne01, ne02      enter only the src0 matrix offset (offset0)
//   ne10, ne12      src1 row stride and the modulus that splits im into (i12, i13)
//   ne0, ne1        dst indexing
//   r2, r3          divisors applied to i12 / i13 when selecting the src0 matrix
//                   (presumably broadcast ratios - verify against the caller)
//   shared_values   only referenced by the commented-out threadgroup copy of the grid below
//   tgpig/tiisg/sgitg  threadgroup position, lane index and simdgroup index
void kernel_mul_mv_iq2_s_f32_impl(
|
| 4843 |
+
device const void * src0,
|
| 4844 |
+
device const float * src1,
|
| 4845 |
+
device float * dst,
|
| 4846 |
+
constant int64_t & ne00,
|
| 4847 |
+
constant int64_t & ne01,
|
| 4848 |
+
constant int64_t & ne02,
|
| 4849 |
+
constant int64_t & ne10,
|
| 4850 |
+
constant int64_t & ne12,
|
| 4851 |
+
constant int64_t & ne0,
|
| 4852 |
+
constant int64_t & ne1,
|
| 4853 |
+
constant uint & r2,
|
| 4854 |
+
constant uint & r3,
|
| 4855 |
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
| 4856 |
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
| 4857 |
+
uint tiisg[[thread_index_in_simdgroup]],
|
| 4858 |
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
| 4859 |
+
|
| 4860 |
+
// number of block_iq2_s blocks in one row of src0
const int nb = ne00/QK_K;
|
| 4861 |
+
const int r0 = tgpig.x;
|
| 4862 |
+
const int r1 = tgpig.y;
|
| 4863 |
+
const int im = tgpig.z;
|
| 4864 |
+
|
| 4865 |
+
// first of the N_DST rows handled by this simdgroup, and its offset in blocks
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
| 4866 |
+
const int ib_row = first_row * nb;
|
| 4867 |
+
|
| 4868 |
+
const uint i12 = im%ne12;
|
| 4869 |
+
const uint i13 = im/ne12;
|
| 4870 |
+
|
| 4871 |
+
// block offset of the src0 matrix selected by (i12/r2, i13/r3)
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
| 4872 |
+
|
| 4873 |
+
device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
|
| 4874 |
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
| 4875 |
+
|
| 4876 |
+
// per-thread copy of the 32 src1 values that belong to the current sub-block
float yl[32];
|
| 4877 |
+
float sumf[N_DST]={0.f}, all_sum;
|
| 4878 |
+
|
| 4879 |
+
// number of 32-value sub-blocks in one row
const int nb32 = nb * (QK_K / 32);
|
| 4880 |
+
|
| 4881 |
+
//threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
|
| 4882 |
+
//{
|
| 4883 |
+
// int nval = 32;
|
| 4884 |
+
// int pos = (32*sgitg + tiisg)*nval;
|
| 4885 |
+
// for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
|
| 4886 |
+
// threadgroup_barrier(mem_flags::mem_threadgroup);
|
| 4887 |
+
//}
|
| 4888 |
+
|
| 4889 |
+
const int ix = tiisg;
|
| 4890 |
+
|
| 4891 |
+
// lane ix starts at the ix-th group of 32 src1 values
device const float * y4 = y + 32 * ix;
|
| 4892 |
+
|
| 4893 |
+
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
| 4894 |
+
|
| 4895 |
+
for (int i = 0; i < 32; ++i) {
|
| 4896 |
+
yl[i] = y4[i];
|
| 4897 |
+
}
|
| 4898 |
+
|
| 4899 |
+
// ibl: block index within the row; ib: 32-value sub-block index within that block
const int ibl = ib32 / (QK_K / 32);
|
| 4900 |
+
const int ib = ib32 % (QK_K / 32);
|
| 4901 |
+
|
| 4902 |
+
device const block_iq2_s * xr = x + ibl;
|
| 4903 |
+
// 4 grid-index bytes per sub-block
device const uint8_t * qs = xr->qs + 4 * ib;
|
| 4904 |
+
device const uint8_t * qh = xr->qh + ib;
|
| 4905 |
+
device const uint8_t * sc = xr->scales + ib;
|
| 4906 |
+
// the sign bytes sit QK_K/8 bytes after the matching index bytes inside qs[]
device const uint8_t * signs = qs + QK_K/8;
|
| 4907 |
+
device const half * dh = &xr->d;
|
| 4908 |
+
|
| 4909 |
+
for (int row = 0; row < N_DST; row++) {
|
| 4910 |
+
|
| 4911 |
+
const float db = dh[0];
|
| 4912 |
+
// low nibble scales values 0..15 of the sub-block, high nibble values 16..31
const float d1 = db * (0.5f + (sc[0] & 0xf));
|
| 4913 |
+
const float d2 = db * (0.5f + (sc[0] >> 4));
|
| 4914 |
+
|
| 4915 |
+
float2 sum = {0};
|
| 4916 |
+
for (int l = 0; l < 2; ++l) {
|
| 4917 |
+
//const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
| 4918 |
+
//const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
| 4919 |
+
// 10-bit index into iq2s_grid: 8 low bits from qs, bits 8..9 from one 2-bit field of qh[0]
// (fields 0,1 for grid1 with l = 0,1; fields 2,3 for grid2); the uint64 entry is read as 8 bytes
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
| 4920 |
+
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
| 4921 |
+
for (int j = 0; j < 8; ++j) {
|
| 4922 |
+
// select(1, -1, c) yields -1 when the sign bit picked by kmask_iq2xs[j] is set
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
|
| 4923 |
+
sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
|
| 4924 |
+
}
|
| 4925 |
+
}
|
| 4926 |
+
sumf[row] += d1 * sum[0] + d2 * sum[1];
|
| 4927 |
+
|
| 4928 |
+
// step to the same sub-block of the next row (rows are nb blocks apart):
// dh is a half pointer, so its stride is in 2-byte units; the others are byte pointers
dh += nb*sizeof(block_iq2_s)/2;
|
| 4929 |
+
qs += nb*sizeof(block_iq2_s);
|
| 4930 |
+
qh += nb*sizeof(block_iq2_s);
|
| 4931 |
+
sc += nb*sizeof(block_iq2_s);
|
| 4932 |
+
signs += nb*sizeof(block_iq2_s);
|
| 4933 |
+
}
|
| 4934 |
+
|
| 4935 |
+
// matches ib32 += 32: skip 32 sub-blocks of 32 floats each
y4 += 32 * 32;
|
| 4936 |
+
}
|
| 4937 |
+
|
| 4938 |
+
for (int row = 0; row < N_DST; ++row) {
|
| 4939 |
+
all_sum = simd_sum(sumf[row]);
|
| 4940 |
+
// only lane 0 writes the reduced value, with the constant 0.25f factor applied once
if (tiisg == 0) {
|
| 4941 |
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
|
| 4942 |
+
}
|
| 4943 |
+
}
|
| 4944 |
+
}
|
| 4945 |
+
|
| 4946 |
+
// Kernel entry point for the IQ2_S matrix-vector product, exported to the host as
// "kernel_mul_mv_iq2_s_f32". It only forwards to kernel_mul_mv_iq2_s_f32_impl.
// nb00/nb01/nb02, ne11 and nb10/nb11/nb12 are part of the signature but are not passed on.
[[host_name("kernel_mul_mv_iq2_s_f32")]]
|
| 4947 |
+
kernel void kernel_mul_mv_iq2_s_f32(
|
| 4948 |
+
device const void * src0,
|
| 4949 |
+
device const float * src1,
|
| 4950 |
+
device float * dst,
|
| 4951 |
+
constant int64_t & ne00,
|
| 4952 |
+
constant int64_t & ne01,
|
| 4953 |
+
constant int64_t & ne02,
|
| 4954 |
+
constant uint64_t & nb00,
|
| 4955 |
+
constant uint64_t & nb01,
|
| 4956 |
+
constant uint64_t & nb02,
|
| 4957 |
+
constant int64_t & ne10,
|
| 4958 |
+
constant int64_t & ne11,
|
| 4959 |
+
constant int64_t & ne12,
|
| 4960 |
+
constant uint64_t & nb10,
|
| 4961 |
+
constant uint64_t & nb11,
|
| 4962 |
+
constant uint64_t & nb12,
|
| 4963 |
+
constant int64_t & ne0,
|
| 4964 |
+
constant int64_t & ne1,
|
| 4965 |
+
constant uint & r2,
|
| 4966 |
+
constant uint & r3,
|
| 4967 |
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
| 4968 |
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
| 4969 |
+
uint tiisg[[thread_index_in_simdgroup]],
|
| 4970 |
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
| 4971 |
+
|
| 4972 |
+
kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
| 4973 |
+
}
|
| 4974 |
+
|
| 4975 |
void kernel_mul_mv_iq1_s_f32_impl(
|
| 4976 |
device const void * src0,
|
| 4977 |
device const float * src1,
|
|
|
|
| 5588 |
}
|
| 5589 |
}
|
| 5590 |
|
| 5591 |
+
template <typename type4x4>
|
| 5592 |
+
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
|
| 5593 |
+
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
| 5594 |
+
const float d = xb->d;
|
| 5595 |
+
const int ib32 = il/2;
|
| 5596 |
+
il = il%2;
|
| 5597 |
+
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
| 5598 |
+
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
| 5599 |
+
device const uint8_t * signs = qs + QK_K/8;
|
| 5600 |
+
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
| 5601 |
+
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
| 5602 |
+
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
|
| 5603 |
+
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
|
| 5604 |
+
for (int i = 0; i < 8; ++i) {
|
| 5605 |
+
reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
|
| 5606 |
+
reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
|
| 5607 |
+
}
|
| 5608 |
+
}
|
| 5609 |
+
|
| 5610 |
template <typename type4x4>
|
| 5611 |
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
| 5612 |
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
|
|
| 6181 |
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 6182 |
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 6183 |
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
| 6184 |
+
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
| 6185 |
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 6186 |
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 6187 |
|
|
|
|
| 6224 |
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 6225 |
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 6226 |
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
| 6227 |
+
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
| 6228 |
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 6229 |
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 6230 |
|
|
|
|
| 6279 |
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
| 6280 |
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
| 6281 |
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
| 6282 |
+
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
| 6283 |
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
| 6284 |
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
| 6285 |
|
|
|
|
| 7315 |
sgitg);
|
| 7316 |
}
|
| 7317 |
|
| 7318 |
+
[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
|
| 7319 |
+
kernel void kernel_mul_mv_id_iq2_s_f32(
|
| 7320 |
+
device const char * ids,
|
| 7321 |
+
device const char * src1,
|
| 7322 |
+
device float * dst,
|
| 7323 |
+
constant uint64_t & nbi1,
|
| 7324 |
+
constant int64_t & ne00,
|
| 7325 |
+
constant int64_t & ne01,
|
| 7326 |
+
constant int64_t & ne02,
|
| 7327 |
+
constant uint64_t & nb00,
|
| 7328 |
+
constant uint64_t & nb01,
|
| 7329 |
+
constant uint64_t & nb02,
|
| 7330 |
+
constant int64_t & ne10,
|
| 7331 |
+
constant int64_t & ne11,
|
| 7332 |
+
constant int64_t & ne12,
|
| 7333 |
+
constant int64_t & ne13,
|
| 7334 |
+
constant uint64_t & nb10,
|
| 7335 |
+
constant uint64_t & nb11,
|
| 7336 |
+
constant uint64_t & nb12,
|
| 7337 |
+
constant int64_t & ne0,
|
| 7338 |
+
constant int64_t & ne1,
|
| 7339 |
+
constant uint64_t & nb1,
|
| 7340 |
+
constant uint & r2,
|
| 7341 |
+
constant uint & r3,
|
| 7342 |
+
constant int & idx,
|
| 7343 |
+
device const char * src00,
|
| 7344 |
+
device const char * src01,
|
| 7345 |
+
device const char * src02,
|
| 7346 |
+
device const char * src03,
|
| 7347 |
+
device const char * src04,
|
| 7348 |
+
device const char * src05,
|
| 7349 |
+
device const char * src06,
|
| 7350 |
+
device const char * src07,
|
| 7351 |
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
| 7352 |
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
| 7353 |
+
uint tiitg[[thread_index_in_threadgroup]],
|
| 7354 |
+
uint tiisg[[thread_index_in_simdgroup]],
|
| 7355 |
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
| 7356 |
+
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
| 7357 |
+
|
| 7358 |
+
const int64_t bid = tgpig.z/(ne12*ne13);
|
| 7359 |
+
|
| 7360 |
+
tgpig.z = tgpig.z%(ne12*ne13);
|
| 7361 |
+
|
| 7362 |
+
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
| 7363 |
+
|
| 7364 |
+
kernel_mul_mv_iq2_s_f32_impl(
|
| 7365 |
+
src0[id],
|
| 7366 |
+
(device const float *) (src1 + bid*nb11),
|
| 7367 |
+
dst + bid*ne0,
|
| 7368 |
+
ne00,
|
| 7369 |
+
ne01,
|
| 7370 |
+
ne02,
|
| 7371 |
+
ne10,
|
| 7372 |
+
ne12,
|
| 7373 |
+
ne0,
|
| 7374 |
+
ne1,
|
| 7375 |
+
r2,
|
| 7376 |
+
r3,
|
| 7377 |
+
shared_values,
|
| 7378 |
+
tgpig,
|
| 7379 |
+
tiisg,
|
| 7380 |
+
sgitg);
|
| 7381 |
+
}
|
| 7382 |
+
|
| 7383 |
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
| 7384 |
kernel void kernel_mul_mv_id_iq1_s_f32(
|
| 7385 |
device const char * ids,
|
ggml-quants.c
CHANGED
|
@@ -3495,6 +3495,265 @@ static const uint64_t iq2xs_grid[512] = {
|
|
| 3495 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 3496 |
};
|
| 3497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3498 |
static const uint32_t iq3xxs_grid[256] = {
|
| 3499 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 3500 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -3796,6 +4055,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
| 3796 |
}
|
| 3797 |
}
|
| 3798 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3799 |
// ====================== 3.0625 bpw (de)-quantization
|
| 3800 |
|
| 3801 |
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
|
@@ -9330,6 +9621,210 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
| 9330 |
#endif
|
| 9331 |
}
|
| 9332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9333 |
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
| 9334 |
assert(n % QK_K == 0);
|
| 9335 |
assert(nrc == 1);
|
|
@@ -9934,22 +10429,25 @@ typedef struct {
|
|
| 9934 |
uint16_t * neighbours;
|
| 9935 |
} iq2_entry_t;
|
| 9936 |
|
| 9937 |
-
static iq2_entry_t iq2_data[
|
|
|
|
| 9938 |
{NULL, NULL, NULL},
|
| 9939 |
{NULL, NULL, NULL},
|
| 9940 |
{NULL, NULL, NULL},
|
| 9941 |
};
|
| 9942 |
|
| 9943 |
static inline int iq2_data_index(enum ggml_type type) {
|
| 9944 |
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
| 9945 |
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
| 9946 |
-
type == GGML_TYPE_IQ2_XS ? 1 :
|
|
|
|
| 9947 |
}
|
| 9948 |
|
| 9949 |
static inline int iq2_grid_size(enum ggml_type type) {
|
| 9950 |
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
| 9951 |
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
| 9952 |
-
type == GGML_TYPE_IQ2_XS ? 512 :
|
|
|
|
| 9953 |
}
|
| 9954 |
|
| 9955 |
static int iq2_compare_func(const void * left, const void * right) {
|
|
@@ -10050,11 +10548,79 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
| 10050 |
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
| 10051 |
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
| 10052 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10053 |
|
| 10054 |
const int kmap_size = 43692;
|
| 10055 |
-
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
|
|
|
| 10056 |
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
| 10057 |
-
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
|
|
|
| 10058 |
uint64_t * kgrid_q2xs;
|
| 10059 |
int * kmap_q2xs;
|
| 10060 |
uint16_t * kneighbors_q2xs;
|
|
@@ -10151,7 +10717,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
| 10151 |
}
|
| 10152 |
|
| 10153 |
void iq2xs_free_impl(enum ggml_type type) {
|
| 10154 |
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
| 10155 |
const int gindex = iq2_data_index(type);
|
| 10156 |
if (iq2_data[gindex].grid) {
|
| 10157 |
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
|
@@ -11557,3 +12123,196 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
|
| 11557 |
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
| 11558 |
}
|
| 11559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3495 |
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
| 3496 |
};
|
| 3497 |
|
| 3498 |
+
static const uint64_t iq2s_grid[1024] = {
|
| 3499 |
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
| 3500 |
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
| 3501 |
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
| 3502 |
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
| 3503 |
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
| 3504 |
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
| 3505 |
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
| 3506 |
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
| 3507 |
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
| 3508 |
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
| 3509 |
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
| 3510 |
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
| 3511 |
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
| 3512 |
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
| 3513 |
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
| 3514 |
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
| 3515 |
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
| 3516 |
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
| 3517 |
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
| 3518 |
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
| 3519 |
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
| 3520 |
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
| 3521 |
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
| 3522 |
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
| 3523 |
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
| 3524 |
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
| 3525 |
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
| 3526 |
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
| 3527 |
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
| 3528 |
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
| 3529 |
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
| 3530 |
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
| 3531 |
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
| 3532 |
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
| 3533 |
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
| 3534 |
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
| 3535 |
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
| 3536 |
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
| 3537 |
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
| 3538 |
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
| 3539 |
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
| 3540 |
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
| 3541 |
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
| 3542 |
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
| 3543 |
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
| 3544 |
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
| 3545 |
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
| 3546 |
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
| 3547 |
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
| 3548 |
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
| 3549 |
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
| 3550 |
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
| 3551 |
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
| 3552 |
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
| 3553 |
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
| 3554 |
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
| 3555 |
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
| 3556 |
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
| 3557 |
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
| 3558 |
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
| 3559 |
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
| 3560 |
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
| 3561 |
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
| 3562 |
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
| 3563 |
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
| 3564 |
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
| 3565 |
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
| 3566 |
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
| 3567 |
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
| 3568 |
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
| 3569 |
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
| 3570 |
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
| 3571 |
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
| 3572 |
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
| 3573 |
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
| 3574 |
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
| 3575 |
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
| 3576 |
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
| 3577 |
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
| 3578 |
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
| 3579 |
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
| 3580 |
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
| 3581 |
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
| 3582 |
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
| 3583 |
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
| 3584 |
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
| 3585 |
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
| 3586 |
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
| 3587 |
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
| 3588 |
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
| 3589 |
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
| 3590 |
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
| 3591 |
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
| 3592 |
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
| 3593 |
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
| 3594 |
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
| 3595 |
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
| 3596 |
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
| 3597 |
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
| 3598 |
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
| 3599 |
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
| 3600 |
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
| 3601 |
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
| 3602 |
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
| 3603 |
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
| 3604 |
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
| 3605 |
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
| 3606 |
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
| 3607 |
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
| 3608 |
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
| 3609 |
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
| 3610 |
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
| 3611 |
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
| 3612 |
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
| 3613 |
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
| 3614 |
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
| 3615 |
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
| 3616 |
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
| 3617 |
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
| 3618 |
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
| 3619 |
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
| 3620 |
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
| 3621 |
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
| 3622 |
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
| 3623 |
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
| 3624 |
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
| 3625 |
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
| 3626 |
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
| 3627 |
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
| 3628 |
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
| 3629 |
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
| 3630 |
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
| 3631 |
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
| 3632 |
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
| 3633 |
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
| 3634 |
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
| 3635 |
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
| 3636 |
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
| 3637 |
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
| 3638 |
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
| 3639 |
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
| 3640 |
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
| 3641 |
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
| 3642 |
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
| 3643 |
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
| 3644 |
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
| 3645 |
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
| 3646 |
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
| 3647 |
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
| 3648 |
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
| 3649 |
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
| 3650 |
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
| 3651 |
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
| 3652 |
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
| 3653 |
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
| 3654 |
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
| 3655 |
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
| 3656 |
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
| 3657 |
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
| 3658 |
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
| 3659 |
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
| 3660 |
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
| 3661 |
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
| 3662 |
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
| 3663 |
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
| 3664 |
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
| 3665 |
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
| 3666 |
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
| 3667 |
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
| 3668 |
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
| 3669 |
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
| 3670 |
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
| 3671 |
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
| 3672 |
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
| 3673 |
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
| 3674 |
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
| 3675 |
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
| 3676 |
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
| 3677 |
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
| 3678 |
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
| 3679 |
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
| 3680 |
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
| 3681 |
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
| 3682 |
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
| 3683 |
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
| 3684 |
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
| 3685 |
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
| 3686 |
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
| 3687 |
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
| 3688 |
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
| 3689 |
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
| 3690 |
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
| 3691 |
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
| 3692 |
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
| 3693 |
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
| 3694 |
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
| 3695 |
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
| 3696 |
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
| 3697 |
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
| 3698 |
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
| 3699 |
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
| 3700 |
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
| 3701 |
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
| 3702 |
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
| 3703 |
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
| 3704 |
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
| 3705 |
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
| 3706 |
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
| 3707 |
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
| 3708 |
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
| 3709 |
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
| 3710 |
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
| 3711 |
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
| 3712 |
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
| 3713 |
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
| 3714 |
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
| 3715 |
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
| 3716 |
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
| 3717 |
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
| 3718 |
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
| 3719 |
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
| 3720 |
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
| 3721 |
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
| 3722 |
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
| 3723 |
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
| 3724 |
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
| 3725 |
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
| 3726 |
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
| 3727 |
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
| 3728 |
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
| 3729 |
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
| 3730 |
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
| 3731 |
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
| 3732 |
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
| 3733 |
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
| 3734 |
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
| 3735 |
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
| 3736 |
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
| 3737 |
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
| 3738 |
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
| 3739 |
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
| 3740 |
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
| 3741 |
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
| 3742 |
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
| 3743 |
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
| 3744 |
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
| 3745 |
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
| 3746 |
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
| 3747 |
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
| 3748 |
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
| 3749 |
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
| 3750 |
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
| 3751 |
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
| 3752 |
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
| 3753 |
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
| 3754 |
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
| 3755 |
+
};
|
| 3756 |
+
|
| 3757 |
static const uint32_t iq3xxs_grid[256] = {
|
| 3758 |
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
| 3759 |
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
|
|
| 4055 |
}
|
| 4056 |
}
|
| 4057 |
|
| 4058 |
+
// ====================== 2.5625 bpw (de)-quantization
|
| 4059 |
+
|
| 4060 |
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
| 4061 |
+
assert(k % QK_K == 0);
|
| 4062 |
+
const int nb = k / QK_K;
|
| 4063 |
+
|
| 4064 |
+
float db[2];
|
| 4065 |
+
|
| 4066 |
+
for (int i = 0; i < nb; i++) {
|
| 4067 |
+
|
| 4068 |
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
| 4069 |
+
const uint8_t * qs = x[i].qs;
|
| 4070 |
+
const uint8_t * qh = x[i].qh;
|
| 4071 |
+
const uint8_t * signs = qs + QK_K/8;
|
| 4072 |
+
|
| 4073 |
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
| 4074 |
+
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
| 4075 |
+
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
| 4076 |
+
for (int l = 0; l < 4; ++l) {
|
| 4077 |
+
const float dl = db[l/2];
|
| 4078 |
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
| 4079 |
+
for (int j = 0; j < 8; ++j) {
|
| 4080 |
+
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 4081 |
+
}
|
| 4082 |
+
y += 8;
|
| 4083 |
+
}
|
| 4084 |
+
qs += 4;
|
| 4085 |
+
signs += 4;
|
| 4086 |
+
}
|
| 4087 |
+
}
|
| 4088 |
+
}
|
| 4089 |
+
|
| 4090 |
// ====================== 3.0625 bpw (de)-quantization
|
| 4091 |
|
| 4092 |
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
|
|
|
| 9621 |
#endif
|
| 9622 |
}
|
| 9623 |
|
| 9624 |
+
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
| 9625 |
+
assert(n % QK_K == 0);
|
| 9626 |
+
assert(nrc == 1);
|
| 9627 |
+
UNUSED(nrc);
|
| 9628 |
+
UNUSED(bx);
|
| 9629 |
+
UNUSED(by);
|
| 9630 |
+
UNUSED(bs);
|
| 9631 |
+
|
| 9632 |
+
const block_iq2_s * restrict x = vx;
|
| 9633 |
+
const block_q8_K * restrict y = vy;
|
| 9634 |
+
|
| 9635 |
+
const int nb = n / QK_K;
|
| 9636 |
+
|
| 9637 |
+
#if defined(__ARM_NEON)
|
| 9638 |
+
|
| 9639 |
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
| 9640 |
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
| 9641 |
+
};
|
| 9642 |
+
|
| 9643 |
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
| 9644 |
+
|
| 9645 |
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
| 9646 |
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
| 9647 |
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
| 9648 |
+
const int32x4_t vzero = vdupq_n_s32(0);
|
| 9649 |
+
|
| 9650 |
+
uint8x16x2_t vs;
|
| 9651 |
+
ggml_int8x16x4_t q2s;
|
| 9652 |
+
ggml_int8x16x4_t q8b;
|
| 9653 |
+
|
| 9654 |
+
float sumf = 0;
|
| 9655 |
+
for (int i = 0; i < nb; ++i) {
|
| 9656 |
+
|
| 9657 |
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
| 9658 |
+
|
| 9659 |
+
const uint8_t * restrict qs = x[i].qs;
|
| 9660 |
+
const uint8_t * restrict qh = x[i].qh;
|
| 9661 |
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
| 9662 |
+
const int8_t * restrict q8 = y[i].qs;
|
| 9663 |
+
|
| 9664 |
+
int sumi1 = 0, sumi2 = 0;
|
| 9665 |
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
| 9666 |
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
| 9667 |
+
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
| 9668 |
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
| 9669 |
+
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
| 9670 |
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
| 9671 |
+
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
| 9672 |
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
| 9673 |
+
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
| 9674 |
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
| 9675 |
+
qs += 8;
|
| 9676 |
+
|
| 9677 |
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
| 9678 |
+
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
| 9679 |
+
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
| 9680 |
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
| 9681 |
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
| 9682 |
+
|
| 9683 |
+
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
| 9684 |
+
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
| 9685 |
+
|
| 9686 |
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
| 9687 |
+
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
| 9688 |
+
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
| 9689 |
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
| 9690 |
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
| 9691 |
+
|
| 9692 |
+
signs += 4;
|
| 9693 |
+
|
| 9694 |
+
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
| 9695 |
+
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
| 9696 |
+
|
| 9697 |
+
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
| 9698 |
+
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
| 9699 |
+
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
| 9700 |
+
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
| 9701 |
+
|
| 9702 |
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
| 9703 |
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
| 9704 |
+
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
| 9705 |
+
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
| 9706 |
+
}
|
| 9707 |
+
sumf += d*(sumi1 + sumi2);
|
| 9708 |
+
}
|
| 9709 |
+
|
| 9710 |
+
*s = 0.125f * sumf;
|
| 9711 |
+
|
| 9712 |
+
#elif defined(__AVX2__)
|
| 9713 |
+
|
| 9714 |
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
| 9715 |
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
| 9716 |
+
};
|
| 9717 |
+
|
| 9718 |
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
| 9719 |
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
| 9720 |
+
};
|
| 9721 |
+
|
| 9722 |
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
| 9723 |
+
const __m128i m1 = _mm_set1_epi8(1);
|
| 9724 |
+
|
| 9725 |
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
| 9726 |
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
| 9727 |
+
|
| 9728 |
+
uint64_t aux64;
|
| 9729 |
+
|
| 9730 |
+
__m256 accumf = _mm256_setzero_ps();
|
| 9731 |
+
for (int i = 0; i < nb; ++i) {
|
| 9732 |
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
| 9733 |
+
const uint8_t * restrict qs = x[i].qs;
|
| 9734 |
+
const uint8_t * restrict qh = x[i].qh;
|
| 9735 |
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
| 9736 |
+
const int8_t * restrict q8 = y[i].qs;
|
| 9737 |
+
|
| 9738 |
+
memcpy(&aux64, x[i].scales, 8);
|
| 9739 |
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
| 9740 |
+
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
| 9741 |
+
|
| 9742 |
+
__m256i sumi1 = _mm256_setzero_si256();
|
| 9743 |
+
__m256i sumi2 = _mm256_setzero_si256();
|
| 9744 |
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
| 9745 |
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
| 9746 |
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
| 9747 |
+
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
| 9748 |
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
| 9749 |
+
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
| 9750 |
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
| 9751 |
+
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
| 9752 |
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
| 9753 |
+
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
| 9754 |
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
| 9755 |
+
qs += 8;
|
| 9756 |
+
|
| 9757 |
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
| 9758 |
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
| 9759 |
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
| 9760 |
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
| 9761 |
+
|
| 9762 |
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
| 9763 |
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
| 9764 |
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
| 9765 |
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
| 9766 |
+
|
| 9767 |
+
signs += 4;
|
| 9768 |
+
|
| 9769 |
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
| 9770 |
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
| 9771 |
+
|
| 9772 |
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
| 9773 |
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
| 9774 |
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
| 9775 |
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
| 9776 |
+
}
|
| 9777 |
+
|
| 9778 |
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
| 9779 |
+
|
| 9780 |
+
}
|
| 9781 |
+
|
| 9782 |
+
*s = 0.125f * hsum_float_8(accumf);
|
| 9783 |
+
|
| 9784 |
+
#else
|
| 9785 |
+
|
| 9786 |
+
float sumf = 0;
|
| 9787 |
+
for (int i = 0; i < nb; i++) {
|
| 9788 |
+
|
| 9789 |
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
| 9790 |
+
const int8_t * q8 = y[i].qs;
|
| 9791 |
+
const uint8_t * qs = x[i].qs;
|
| 9792 |
+
const uint8_t * qh = x[i].qh;
|
| 9793 |
+
const uint8_t * signs = qs + QK_K/8;
|
| 9794 |
+
|
| 9795 |
+
int bsum = 0;
|
| 9796 |
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
| 9797 |
+
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
| 9798 |
+
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
| 9799 |
+
int sumi1 = 0, sumi2 = 0;
|
| 9800 |
+
for (int l = 0; l < 2; ++l) {
|
| 9801 |
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
| 9802 |
+
for (int j = 0; j < 8; ++j) {
|
| 9803 |
+
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
| 9804 |
+
}
|
| 9805 |
+
q8 += 8;
|
| 9806 |
+
}
|
| 9807 |
+
for (int l = 2; l < 4; ++l) {
|
| 9808 |
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
| 9809 |
+
for (int j = 0; j < 8; ++j) {
|
| 9810 |
+
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
| 9811 |
+
}
|
| 9812 |
+
q8 += 8;
|
| 9813 |
+
}
|
| 9814 |
+
bsum += ls1 * sumi1 + ls2 * sumi2;
|
| 9815 |
+
qs += 4;
|
| 9816 |
+
signs += 4;
|
| 9817 |
+
}
|
| 9818 |
+
|
| 9819 |
+
sumf += d * bsum;
|
| 9820 |
+
}
|
| 9821 |
+
|
| 9822 |
+
*s = 0.125f * sumf;
|
| 9823 |
+
|
| 9824 |
+
#endif
|
| 9825 |
+
|
| 9826 |
+
}
|
| 9827 |
+
|
| 9828 |
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
| 9829 |
assert(n % QK_K == 0);
|
| 9830 |
assert(nrc == 1);
|
|
|
|
| 10429 |
uint16_t * neighbours;
|
| 10430 |
} iq2_entry_t;
|
| 10431 |
|
| 10432 |
+
static iq2_entry_t iq2_data[4] = {
|
| 10433 |
+
{NULL, NULL, NULL},
|
| 10434 |
{NULL, NULL, NULL},
|
| 10435 |
{NULL, NULL, NULL},
|
| 10436 |
{NULL, NULL, NULL},
|
| 10437 |
};
|
| 10438 |
|
| 10439 |
static inline int iq2_data_index(enum ggml_type type) {
|
| 10440 |
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
| 10441 |
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
| 10442 |
+
type == GGML_TYPE_IQ2_XS ? 1 :
|
| 10443 |
+
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
| 10444 |
}
|
| 10445 |
|
| 10446 |
static inline int iq2_grid_size(enum ggml_type type) {
|
| 10447 |
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
| 10448 |
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
| 10449 |
+
type == GGML_TYPE_IQ2_XS ? 512 :
|
| 10450 |
+
type == GGML_TYPE_IQ1_S ? 512 : 1024;
|
| 10451 |
}
|
| 10452 |
|
| 10453 |
static int iq2_compare_func(const void * left, const void * right) {
|
|
|
|
| 10548 |
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
| 10549 |
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
| 10550 |
};
|
| 10551 |
+
static const uint16_t kgrid_2bit_1024[1024] = {
|
| 10552 |
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
| 10553 |
+
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
| 10554 |
+
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
| 10555 |
+
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
| 10556 |
+
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
| 10557 |
+
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
| 10558 |
+
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
| 10559 |
+
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
| 10560 |
+
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
| 10561 |
+
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
| 10562 |
+
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
| 10563 |
+
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
| 10564 |
+
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
| 10565 |
+
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
| 10566 |
+
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
| 10567 |
+
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
| 10568 |
+
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
| 10569 |
+
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
| 10570 |
+
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
| 10571 |
+
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
| 10572 |
+
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
| 10573 |
+
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
| 10574 |
+
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
| 10575 |
+
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
| 10576 |
+
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
| 10577 |
+
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
| 10578 |
+
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
| 10579 |
+
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
| 10580 |
+
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
| 10581 |
+
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
| 10582 |
+
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
| 10583 |
+
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
| 10584 |
+
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
| 10585 |
+
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
| 10586 |
+
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
| 10587 |
+
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
| 10588 |
+
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
| 10589 |
+
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
| 10590 |
+
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
| 10591 |
+
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
| 10592 |
+
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
| 10593 |
+
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
| 10594 |
+
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
| 10595 |
+
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
| 10596 |
+
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
| 10597 |
+
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
| 10598 |
+
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
| 10599 |
+
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
| 10600 |
+
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
| 10601 |
+
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
| 10602 |
+
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
| 10603 |
+
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
| 10604 |
+
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
| 10605 |
+
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
| 10606 |
+
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
| 10607 |
+
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
| 10608 |
+
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
| 10609 |
+
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
| 10610 |
+
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
| 10611 |
+
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
| 10612 |
+
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
| 10613 |
+
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
| 10614 |
+
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
| 10615 |
+
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
| 10616 |
+
};
|
| 10617 |
|
| 10618 |
const int kmap_size = 43692;
|
| 10619 |
+
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
| 10620 |
+
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
| 10621 |
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
| 10622 |
+
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
| 10623 |
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
|
| 10624 |
uint64_t * kgrid_q2xs;
|
| 10625 |
int * kmap_q2xs;
|
| 10626 |
uint16_t * kneighbors_q2xs;
|
|
|
|
| 10717 |
}
|
| 10718 |
|
| 10719 |
void iq2xs_free_impl(enum ggml_type type) {
|
| 10720 |
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
| 10721 |
const int gindex = iq2_data_index(type);
|
| 10722 |
if (iq2_data[gindex].grid) {
|
| 10723 |
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
|
|
|
| 12123 |
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
| 12124 |
}
|
| 12125 |
|
| 12126 |
+
// =============================== 2.5625 bpw
|
| 12127 |
+
|
| 12128 |
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
| 12129 |
+
|
| 12130 |
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
| 12131 |
+
|
| 12132 |
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
| 12133 |
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 12134 |
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 12135 |
+
|
| 12136 |
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
| 12137 |
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
| 12138 |
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
| 12139 |
+
GGML_ASSERT(n%QK_K == 0);
|
| 12140 |
+
|
| 12141 |
+
const int kMaxQ = 3;
|
| 12142 |
+
|
| 12143 |
+
const int nbl = n/256;
|
| 12144 |
+
|
| 12145 |
+
block_iq2_s * y = vy;
|
| 12146 |
+
|
| 12147 |
+
float scales[QK_K/16];
|
| 12148 |
+
float weight[16];
|
| 12149 |
+
float xval[16];
|
| 12150 |
+
int8_t L[16];
|
| 12151 |
+
int8_t Laux[16];
|
| 12152 |
+
float waux[16];
|
| 12153 |
+
bool is_on_grid[2];
|
| 12154 |
+
bool is_on_grid_aux[2];
|
| 12155 |
+
uint8_t block_signs[2];
|
| 12156 |
+
|
| 12157 |
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
| 12158 |
+
|
| 12159 |
+
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
| 12160 |
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
| 12161 |
+
|
| 12162 |
+
float max_scale = 0;
|
| 12163 |
+
|
| 12164 |
+
const float * xbl = x + QK_K*ibl;
|
| 12165 |
+
float sumx2 = 0;
|
| 12166 |
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
| 12167 |
+
float sigma2 = 2*sumx2/QK_K;
|
| 12168 |
+
|
| 12169 |
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 12170 |
+
const float * xb = xbl + 16*ib;
|
| 12171 |
+
if (quant_weights) {
|
| 12172 |
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
| 12173 |
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 12174 |
+
} else {
|
| 12175 |
+
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
| 12176 |
+
}
|
| 12177 |
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
| 12178 |
+
for (int k = 0; k < 2; ++k) {
|
| 12179 |
+
uint8_t s = 0;
|
| 12180 |
+
for (int i = 0; i < 8; ++i) {
|
| 12181 |
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
| 12182 |
+
else {
|
| 12183 |
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
| 12184 |
+
}
|
| 12185 |
+
}
|
| 12186 |
+
block_signs[k] = s;
|
| 12187 |
+
}
|
| 12188 |
+
float max = xval[0];
|
| 12189 |
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 12190 |
+
if (!max) {
|
| 12191 |
+
scales[ib] = 0;
|
| 12192 |
+
continue;
|
| 12193 |
+
}
|
| 12194 |
+
float best = 0;
|
| 12195 |
+
float scale = max/(2*kMaxQ-1);
|
| 12196 |
+
is_on_grid[0] = is_on_grid[1] = true;
|
| 12197 |
+
for (int is = -9; is <= 9; ++is) {
|
| 12198 |
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
| 12199 |
+
float this_scale = 1/id;
|
| 12200 |
+
for (int k = 0; k < 2; ++k) {
|
| 12201 |
+
for (int i = 0; i < 8; ++i) {
|
| 12202 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 12203 |
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
| 12204 |
+
}
|
| 12205 |
+
uint16_t u = 0;
|
| 12206 |
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
| 12207 |
+
int grid_index = kmap_q2xs[u];
|
| 12208 |
+
is_on_grid_aux[k] = true;
|
| 12209 |
+
if (grid_index < 0) {
|
| 12210 |
+
is_on_grid_aux[k] = false;
|
| 12211 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 12212 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
| 12213 |
+
}
|
| 12214 |
+
}
|
| 12215 |
+
float sumqx = 0, sumq2 = 0;
|
| 12216 |
+
for (int i = 0; i < 16; ++i) {
|
| 12217 |
+
float w = weight[i];
|
| 12218 |
+
float q = 2*Laux[i] + 1;
|
| 12219 |
+
sumqx += w*xval[i]*q;
|
| 12220 |
+
sumq2 += w*q*q;
|
| 12221 |
+
}
|
| 12222 |
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
| 12223 |
+
scale = sumqx/sumq2; best = scale*sumqx;
|
| 12224 |
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
| 12225 |
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
| 12226 |
+
}
|
| 12227 |
+
}
|
| 12228 |
+
int n_not_ongrid = 0;
|
| 12229 |
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
| 12230 |
+
if (n_not_ongrid > 0 && scale > 0) {
|
| 12231 |
+
float id = 1/scale;
|
| 12232 |
+
for (int k = 0; k < 2; ++k) {
|
| 12233 |
+
if (is_on_grid[k]) continue;
|
| 12234 |
+
uint16_t u = 0;
|
| 12235 |
+
for (int i = 0; i < 8; ++i) {
|
| 12236 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 12237 |
+
l = MAX(0, MIN(kMaxQ-1, l));
|
| 12238 |
+
u |= (l << 2*i);
|
| 12239 |
+
L[8*k + i] = l;
|
| 12240 |
+
}
|
| 12241 |
+
int grid_index = kmap_q2xs[u];
|
| 12242 |
+
if (grid_index < 0) {
|
| 12243 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 12244 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
| 12245 |
+
}
|
| 12246 |
+
}
|
| 12247 |
+
float sumqx = 0, sumq2 = 0;
|
| 12248 |
+
for (int i = 0; i < 16; ++i) {
|
| 12249 |
+
float w = weight[i];
|
| 12250 |
+
float q = 2*L[i] + 1;
|
| 12251 |
+
sumqx += w*xval[i]*q;
|
| 12252 |
+
sumq2 += w*q*q;
|
| 12253 |
+
}
|
| 12254 |
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
| 12255 |
+
}
|
| 12256 |
+
if (scale < 0) {
|
| 12257 |
+
scale = -scale;
|
| 12258 |
+
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
| 12259 |
+
}
|
| 12260 |
+
for (int k = 0; k < 2; ++k) {
|
| 12261 |
+
uint16_t u = 0;
|
| 12262 |
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
| 12263 |
+
int grid_index = kmap_q2xs[u];
|
| 12264 |
+
if (grid_index < 0) {
|
| 12265 |
+
printf("Oops: found point %u not on grid:", u);
|
| 12266 |
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
| 12267 |
+
printf("\n");
|
| 12268 |
+
GGML_ASSERT(false);
|
| 12269 |
+
}
|
| 12270 |
+
const int i8 = 2*ib + k;
|
| 12271 |
+
y[ibl].qs[i8] = grid_index & 255;
|
| 12272 |
+
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
| 12273 |
+
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
| 12274 |
+
}
|
| 12275 |
+
GGML_ASSERT(scale >= 0);
|
| 12276 |
+
scales[ib] = scale;
|
| 12277 |
+
max_scale = MAX(max_scale, scale);
|
| 12278 |
+
}
|
| 12279 |
+
|
| 12280 |
+
if (!max_scale) {
|
| 12281 |
+
continue;
|
| 12282 |
+
}
|
| 12283 |
+
|
| 12284 |
+
float d = max_scale/31;
|
| 12285 |
+
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
| 12286 |
+
float id = 1/d;
|
| 12287 |
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 12288 |
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
| 12289 |
+
l = MAX(0, MIN(15, l));
|
| 12290 |
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
| 12291 |
+
else y[ibl].scales[ib/2] |= (l << 4);
|
| 12292 |
+
}
|
| 12293 |
+
}
|
| 12294 |
+
}
|
| 12295 |
+
|
| 12296 |
+
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 12297 |
+
(void)hist;
|
| 12298 |
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 12299 |
+
int nblock = n_per_row/QK_K;
|
| 12300 |
+
char * qrow = (char *)dst;
|
| 12301 |
+
for (int row = 0; row < nrow; ++row) {
|
| 12302 |
+
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
| 12303 |
+
src += n_per_row;
|
| 12304 |
+
qrow += nblock*sizeof(block_iq2_s);
|
| 12305 |
+
}
|
| 12306 |
+
return nrow * nblock * sizeof(block_iq2_s);
|
| 12307 |
+
}
|
| 12308 |
+
|
| 12309 |
+
// Reference entry point: quantize a single row of `k` floats from `x` into
// IQ2_S blocks at `y`. `k` must be a multiple of QK_K. Delegates to
// quantize_iq2_s() with one row, no histogram and no quantization weights.
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
    assert((k % QK_K) == 0);
    quantize_iq2_s(x, y, /*nrow=*/1, /*n_per_row=*/k, /*hist=*/NULL, /*quant_weights=*/NULL);
}
|
| 12313 |
+
|
| 12314 |
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
| 12315 |
+
assert(k % QK_K == 0);
|
| 12316 |
+
block_iq2_s * restrict y = vy;
|
| 12317 |
+
quantize_row_iq2_s_reference(x, y, k);
|
| 12318 |
+
}
|
ggml-quants.h
CHANGED
|
@@ -182,6 +182,15 @@ typedef struct {
|
|
| 182 |
} block_iq2_xs;
|
| 183 |
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
// (Almost) "true" 3-bit quantization.
|
| 186 |
// Due to the need to use blocks as per ggml design, it ends up using
|
| 187 |
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
|
@@ -242,6 +251,7 @@ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGM
|
|
| 242 |
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
| 243 |
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
| 244 |
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
|
|
|
| 245 |
|
| 246 |
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 247 |
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
@@ -259,6 +269,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|
| 259 |
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 260 |
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 261 |
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
| 262 |
|
| 263 |
// Dequantization
|
| 264 |
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
@@ -276,6 +287,7 @@ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRI
|
|
| 276 |
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 277 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 278 |
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
| 279 |
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 280 |
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 281 |
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
@@ -295,6 +307,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
| 295 |
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 296 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 297 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
|
|
| 298 |
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 299 |
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 300 |
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
@@ -305,6 +318,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
| 305 |
//
|
| 306 |
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 307 |
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
|
|
| 308 |
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 309 |
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 310 |
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
|
|
| 182 |
} block_iq2_xs;
|
| 183 |
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 184 |
|
| 185 |
+
// 2.5625 bpw quants
|
| 186 |
+
// One IQ2_S block covering QK_K weights. Field notes below describe how the quantizer shown in this change fills them.
typedef struct {
|
| 187 |
+
ggml_fp16_t d; // fp16 block-wide scale; the quantizer stores (max sub-block scale / 31) * 0.9875 here
|
| 188 |
+
uint8_t qs[QK_K/4]; // first QK_K/8 bytes: low 8 bits of the grid index for each group of 8 values; next QK_K/8 bytes: per-group sign bitmask (bit i set = value i negative)
|
| 189 |
+
uint8_t qh[QK_K/32]; // remaining high bits of each grid index, 2 bits per group, four groups per byte
|
| 190 |
+
uint8_t scales[QK_K/32]; // 4-bit scale per 16 values, two per byte: even sub-block in the low nibble, odd sub-block in the high nibble
|
| 191 |
+
} block_iq2_s;
|
| 192 |
+
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
| 193 |
+
|
| 194 |
// (Almost) "true" 3-bit quantization.
|
| 195 |
// Due to the need to use blocks as per ggml design, it ends up using
|
| 196 |
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
|
|
|
| 251 |
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
| 252 |
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
| 253 |
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
| 254 |
+
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
|
| 255 |
|
| 256 |
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 257 |
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
| 269 |
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 270 |
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 271 |
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 272 |
+
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 273 |
|
| 274 |
// Dequantization
|
| 275 |
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
| 287 |
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 288 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 289 |
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 290 |
+
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 291 |
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 292 |
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 293 |
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
| 307 |
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 308 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 309 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 310 |
+
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 311 |
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 312 |
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 313 |
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
|
|
| 318 |
//
|
| 319 |
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 320 |
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 321 |
+
size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 322 |
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 323 |
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 324 |
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
ggml.c
CHANGED
|
@@ -694,6 +694,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 694 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 695 |
.nrows = 1,
|
| 696 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
[GGML_TYPE_IQ1_S] = {
|
| 698 |
.type_name = "iq1_s",
|
| 699 |
.blck_size = QK_K,
|
|
@@ -2327,6 +2339,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
| 2327 |
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
|
| 2328 |
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
|
| 2329 |
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
|
|
|
| 2330 |
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
| 2331 |
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
| 2332 |
}
|
|
@@ -7764,6 +7777,7 @@ static void ggml_compute_forward_add(
|
|
| 7764 |
case GGML_TYPE_IQ1_S:
|
| 7765 |
case GGML_TYPE_IQ4_NL:
|
| 7766 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 7767 |
{
|
| 7768 |
ggml_compute_forward_add_q_f32(params, dst);
|
| 7769 |
} break;
|
|
@@ -8044,6 +8058,7 @@ static void ggml_compute_forward_add1(
|
|
| 8044 |
case GGML_TYPE_IQ1_S:
|
| 8045 |
case GGML_TYPE_IQ4_NL:
|
| 8046 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 8047 |
{
|
| 8048 |
ggml_compute_forward_add1_q_f32(params, dst);
|
| 8049 |
} break;
|
|
@@ -8169,6 +8184,7 @@ static void ggml_compute_forward_acc(
|
|
| 8169 |
case GGML_TYPE_IQ1_S:
|
| 8170 |
case GGML_TYPE_IQ4_NL:
|
| 8171 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 8172 |
default:
|
| 8173 |
{
|
| 8174 |
GGML_ASSERT(false);
|
|
@@ -11068,6 +11084,7 @@ static void ggml_compute_forward_out_prod(
|
|
| 11068 |
case GGML_TYPE_IQ1_S:
|
| 11069 |
case GGML_TYPE_IQ4_NL:
|
| 11070 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 11071 |
{
|
| 11072 |
ggml_compute_forward_out_prod_q_f32(params, dst);
|
| 11073 |
} break;
|
|
@@ -11257,6 +11274,7 @@ static void ggml_compute_forward_set(
|
|
| 11257 |
case GGML_TYPE_IQ1_S:
|
| 11258 |
case GGML_TYPE_IQ4_NL:
|
| 11259 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 11260 |
default:
|
| 11261 |
{
|
| 11262 |
GGML_ASSERT(false);
|
|
@@ -11460,6 +11478,7 @@ static void ggml_compute_forward_get_rows(
|
|
| 11460 |
case GGML_TYPE_IQ1_S:
|
| 11461 |
case GGML_TYPE_IQ4_NL:
|
| 11462 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 11463 |
{
|
| 11464 |
ggml_compute_forward_get_rows_q(params, dst);
|
| 11465 |
} break;
|
|
@@ -12161,6 +12180,7 @@ static void ggml_compute_forward_alibi(
|
|
| 12161 |
case GGML_TYPE_IQ1_S:
|
| 12162 |
case GGML_TYPE_IQ4_NL:
|
| 12163 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 12164 |
case GGML_TYPE_Q8_K:
|
| 12165 |
case GGML_TYPE_I8:
|
| 12166 |
case GGML_TYPE_I16:
|
|
@@ -12245,6 +12265,7 @@ static void ggml_compute_forward_clamp(
|
|
| 12245 |
case GGML_TYPE_IQ1_S:
|
| 12246 |
case GGML_TYPE_IQ4_NL:
|
| 12247 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 12248 |
case GGML_TYPE_Q8_K:
|
| 12249 |
case GGML_TYPE_I8:
|
| 12250 |
case GGML_TYPE_I16:
|
|
@@ -19500,6 +19521,7 @@ void ggml_quantize_init(enum ggml_type type) {
|
|
| 19500 |
switch (type) {
|
| 19501 |
case GGML_TYPE_IQ2_XXS:
|
| 19502 |
case GGML_TYPE_IQ2_XS:
|
|
|
|
| 19503 |
case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
|
| 19504 |
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
| 19505 |
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
|
@@ -19786,6 +19808,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 19786 |
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 19787 |
GGML_ASSERT(result == row_size * nrows);
|
| 19788 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19789 |
case GGML_TYPE_IQ1_S:
|
| 19790 |
{
|
| 19791 |
GGML_ASSERT(start % QK_K == 0);
|
|
|
|
| 694 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 695 |
.nrows = 1,
|
| 696 |
},
|
| 697 |
+
[GGML_TYPE_IQ2_S] = {
|
| 698 |
+
.type_name = "iq2_s",
|
| 699 |
+
.blck_size = QK_K,
|
| 700 |
+
.type_size = sizeof(block_iq2_s),
|
| 701 |
+
.is_quantized = true,
|
| 702 |
+
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
| 703 |
+
.from_float = quantize_row_iq2_s,
|
| 704 |
+
.from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
|
| 705 |
+
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
| 706 |
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 707 |
+
.nrows = 1,
|
| 708 |
+
},
|
| 709 |
[GGML_TYPE_IQ1_S] = {
|
| 710 |
.type_name = "iq1_s",
|
| 711 |
.blck_size = QK_K,
|
|
|
|
| 2339 |
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
|
| 2340 |
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
|
| 2341 |
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
| 2342 |
+
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
| 2343 |
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
| 2344 |
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
| 2345 |
}
|
|
|
|
| 7777 |
case GGML_TYPE_IQ1_S:
|
| 7778 |
case GGML_TYPE_IQ4_NL:
|
| 7779 |
case GGML_TYPE_IQ3_S:
|
| 7780 |
+
case GGML_TYPE_IQ2_S:
|
| 7781 |
{
|
| 7782 |
ggml_compute_forward_add_q_f32(params, dst);
|
| 7783 |
} break;
|
|
|
|
| 8058 |
case GGML_TYPE_IQ1_S:
|
| 8059 |
case GGML_TYPE_IQ4_NL:
|
| 8060 |
case GGML_TYPE_IQ3_S:
|
| 8061 |
+
case GGML_TYPE_IQ2_S:
|
| 8062 |
{
|
| 8063 |
ggml_compute_forward_add1_q_f32(params, dst);
|
| 8064 |
} break;
|
|
|
|
| 8184 |
case GGML_TYPE_IQ1_S:
|
| 8185 |
case GGML_TYPE_IQ4_NL:
|
| 8186 |
case GGML_TYPE_IQ3_S:
|
| 8187 |
+
case GGML_TYPE_IQ2_S:
|
| 8188 |
default:
|
| 8189 |
{
|
| 8190 |
GGML_ASSERT(false);
|
|
|
|
| 11084 |
case GGML_TYPE_IQ1_S:
|
| 11085 |
case GGML_TYPE_IQ4_NL:
|
| 11086 |
case GGML_TYPE_IQ3_S:
|
| 11087 |
+
case GGML_TYPE_IQ2_S:
|
| 11088 |
{
|
| 11089 |
ggml_compute_forward_out_prod_q_f32(params, dst);
|
| 11090 |
} break;
|
|
|
|
| 11274 |
case GGML_TYPE_IQ1_S:
|
| 11275 |
case GGML_TYPE_IQ4_NL:
|
| 11276 |
case GGML_TYPE_IQ3_S:
|
| 11277 |
+
case GGML_TYPE_IQ2_S:
|
| 11278 |
default:
|
| 11279 |
{
|
| 11280 |
GGML_ASSERT(false);
|
|
|
|
| 11478 |
case GGML_TYPE_IQ1_S:
|
| 11479 |
case GGML_TYPE_IQ4_NL:
|
| 11480 |
case GGML_TYPE_IQ3_S:
|
| 11481 |
+
case GGML_TYPE_IQ2_S:
|
| 11482 |
{
|
| 11483 |
ggml_compute_forward_get_rows_q(params, dst);
|
| 11484 |
} break;
|
|
|
|
| 12180 |
case GGML_TYPE_IQ1_S:
|
| 12181 |
case GGML_TYPE_IQ4_NL:
|
| 12182 |
case GGML_TYPE_IQ3_S:
|
| 12183 |
+
case GGML_TYPE_IQ2_S:
|
| 12184 |
case GGML_TYPE_Q8_K:
|
| 12185 |
case GGML_TYPE_I8:
|
| 12186 |
case GGML_TYPE_I16:
|
|
|
|
| 12265 |
case GGML_TYPE_IQ1_S:
|
| 12266 |
case GGML_TYPE_IQ4_NL:
|
| 12267 |
case GGML_TYPE_IQ3_S:
|
| 12268 |
+
case GGML_TYPE_IQ2_S:
|
| 12269 |
case GGML_TYPE_Q8_K:
|
| 12270 |
case GGML_TYPE_I8:
|
| 12271 |
case GGML_TYPE_I16:
|
|
|
|
| 19521 |
switch (type) {
|
| 19522 |
case GGML_TYPE_IQ2_XXS:
|
| 19523 |
case GGML_TYPE_IQ2_XS:
|
| 19524 |
+
case GGML_TYPE_IQ2_S:
|
| 19525 |
case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
|
| 19526 |
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
| 19527 |
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
|
|
|
| 19808 |
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 19809 |
GGML_ASSERT(result == row_size * nrows);
|
| 19810 |
} break;
|
| 19811 |
+
case GGML_TYPE_IQ2_S:
|
| 19812 |
+
{
|
| 19813 |
+
GGML_ASSERT(start % QK_K == 0);
|
| 19814 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 19815 |
+
size_t start_row = start / n_per_row;
|
| 19816 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 19817 |
+
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 19818 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 19819 |
+
} break;
|
| 19820 |
case GGML_TYPE_IQ1_S:
|
| 19821 |
{
|
| 19822 |
GGML_ASSERT(start % QK_K == 0);
|
ggml.h
CHANGED
|
@@ -351,6 +351,7 @@ extern "C" {
|
|
| 351 |
GGML_TYPE_IQ1_S = 19,
|
| 352 |
GGML_TYPE_IQ4_NL = 20,
|
| 353 |
GGML_TYPE_IQ3_S = 21,
|
|
|
|
| 354 |
GGML_TYPE_I8,
|
| 355 |
GGML_TYPE_I16,
|
| 356 |
GGML_TYPE_I32,
|
|
@@ -391,6 +392,7 @@ extern "C" {
|
|
| 391 |
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
| 392 |
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
| 393 |
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
|
|
|
| 394 |
};
|
| 395 |
|
| 396 |
// available tensor operations:
|
|
|
|
| 351 |
GGML_TYPE_IQ1_S = 19,
|
| 352 |
GGML_TYPE_IQ4_NL = 20,
|
| 353 |
GGML_TYPE_IQ3_S = 21,
|
| 354 |
+
GGML_TYPE_IQ2_S = 22,
|
| 355 |
GGML_TYPE_I8,
|
| 356 |
GGML_TYPE_I16,
|
| 357 |
GGML_TYPE_I32,
|
|
|
|
| 392 |
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
| 393 |
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
| 394 |
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
| 395 |
+
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
| 396 |
};
|
| 397 |
|
| 398 |
// available tensor operations:
|