Spaces:
Sleeping
Sleeping
ggml : add IQ2 to test-backend-ops + refactoring (llama/4990)
Browse files* ggml : add IQ2 to test-backend-ops + refactoring
ggml-ci
* cuda : update supports_op for IQ2
ggml-ci
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes
ggml-ci
* cuda : fix out-of-bounds-access in `mul_mat_vec_q`
ggml-ci
* tests : avoid creating RNGs for each Q tensor
ggml-ci
* tests : avoid creating RNGs for each tensor
ggml-ci
- ggml-backend.c +2 -0
- ggml-cuda.cu +9 -3
- ggml-quants.c +30 -44
- ggml-quants.h +3 -0
- ggml.c +31 -3
- ggml.h +16 -4
ggml-backend.c
CHANGED
|
@@ -692,6 +692,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
|
|
| 692 |
|
| 693 |
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 694 |
switch (op->op) {
|
|
|
|
|
|
|
| 695 |
case GGML_OP_MUL_MAT:
|
| 696 |
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
| 697 |
default:
|
|
|
|
| 692 |
|
| 693 |
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 694 |
switch (op->op) {
|
| 695 |
+
case GGML_OP_CPY:
|
| 696 |
+
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
|
| 697 |
case GGML_OP_MUL_MAT:
|
| 698 |
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
| 699 |
default:
|
ggml-cuda.cu
CHANGED
|
@@ -5131,10 +5131,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
| 5131 |
const block_q_t * x = (const block_q_t *) vx;
|
| 5132 |
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 5133 |
|
| 5134 |
-
for (int i =
|
| 5135 |
-
const int ibx = row*blocks_per_row + i
|
| 5136 |
|
| 5137 |
-
const int iby =
|
| 5138 |
|
| 5139 |
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
| 5140 |
|
|
@@ -10918,6 +10918,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 10918 |
if (a->ne[3] != b->ne[3]) {
|
| 10919 |
return false;
|
| 10920 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10921 |
return true;
|
| 10922 |
} break;
|
| 10923 |
case GGML_OP_GET_ROWS:
|
|
|
|
| 5131 |
const block_q_t * x = (const block_q_t *) vx;
|
| 5132 |
const block_q8_1 * y = (const block_q8_1 *) vy;
|
| 5133 |
|
| 5134 |
+
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
| 5135 |
+
const int ibx = row*blocks_per_row + i; // x block index
|
| 5136 |
|
| 5137 |
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
| 5138 |
|
| 5139 |
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
| 5140 |
|
|
|
|
| 10918 |
if (a->ne[3] != b->ne[3]) {
|
| 10919 |
return false;
|
| 10920 |
}
|
| 10921 |
+
ggml_type a_type = a->type;
|
| 10922 |
+
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
|
| 10923 |
+
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
| 10924 |
+
return false;
|
| 10925 |
+
}
|
| 10926 |
+
}
|
| 10927 |
return true;
|
| 10928 |
} break;
|
| 10929 |
case GGML_OP_GET_ROWS:
|
ggml-quants.c
CHANGED
|
@@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
| 1274 |
}
|
| 1275 |
float sumlx = 0;
|
| 1276 |
float suml2 = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1277 |
for (int i = 0; i < n; ++i) {
|
|
|
|
| 1278 |
int l = nearest_int(iscale * x[i]);
|
| 1279 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1280 |
L[i] = l + nmax;
|
|
@@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|
| 1649 |
float max = x[0];
|
| 1650 |
float sum_w = weights ? weights[0] : x[0]*x[0];
|
| 1651 |
float sum_x = sum_w * x[0];
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1652 |
for (int i = 1; i < n; ++i) {
|
|
|
|
| 1653 |
if (x[i] < min) min = x[i];
|
| 1654 |
if (x[i] > max) max = x[i];
|
| 1655 |
float w = weights ? weights[i] : x[i]*x[i];
|
|
@@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|
| 1660 |
min = 0;
|
| 1661 |
}
|
| 1662 |
if (max <= min) {
|
| 1663 |
-
|
| 1664 |
*the_min = -min;
|
| 1665 |
return 0.f;
|
| 1666 |
}
|
|
@@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
| 1862 |
|
| 1863 |
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 1864 |
(void)hist;
|
| 1865 |
-
|
| 1866 |
if (!quant_weights) {
|
| 1867 |
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
| 1868 |
}
|
|
@@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
| 2181 |
|
| 2182 |
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2183 |
(void)hist;
|
| 2184 |
-
|
| 2185 |
if (!quant_weights) {
|
| 2186 |
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
| 2187 |
}
|
|
@@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2448 |
|
| 2449 |
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2450 |
(void)hist;
|
| 2451 |
-
|
| 2452 |
if (!quant_weights) {
|
| 2453 |
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
| 2454 |
}
|
|
@@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2771 |
|
| 2772 |
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2773 |
(void)hist;
|
| 2774 |
-
|
| 2775 |
if (!quant_weights) {
|
| 2776 |
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
| 2777 |
}
|
|
@@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
| 3025 |
|
| 3026 |
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3027 |
(void)hist;
|
| 3028 |
-
|
| 3029 |
if (!quant_weights) {
|
| 3030 |
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
| 3031 |
}
|
|
@@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
|
|
| 3072 |
if (!quant_weights) {
|
| 3073 |
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3074 |
}
|
| 3075 |
-
|
| 3076 |
char * qrow = (char *)dst;
|
| 3077 |
for (int row = 0; row < nrow; ++row) {
|
| 3078 |
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
|
@@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|
| 3116 |
if (!quant_weights) {
|
| 3117 |
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3118 |
}
|
| 3119 |
-
|
| 3120 |
char * qrow = (char *)dst;
|
| 3121 |
for (int row = 0; row < nrow; ++row) {
|
| 3122 |
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
|
@@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
|
|
| 3169 |
if (!quant_weights) {
|
| 3170 |
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3171 |
}
|
| 3172 |
-
|
| 3173 |
char * qrow = (char *)dst;
|
| 3174 |
for (int row = 0; row < nrow; ++row) {
|
| 3175 |
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
|
@@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|
| 3221 |
if (!quant_weights) {
|
| 3222 |
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3223 |
}
|
| 3224 |
-
|
| 3225 |
char * qrow = (char *)dst;
|
| 3226 |
for (int row = 0; row < nrow; ++row) {
|
| 3227 |
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
|
@@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
|
|
| 8565 |
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
| 8566 |
}
|
| 8567 |
|
| 8568 |
-
|
| 8569 |
const int gindex = iq2_data_index(grid_size);
|
| 8570 |
if (iq2_data[gindex].grid) {
|
| 8571 |
return;
|
|
@@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
|
|
| 8720 |
free(dist2);
|
| 8721 |
}
|
| 8722 |
|
| 8723 |
-
void
|
| 8724 |
-
if (type == GGML_TYPE_IQ2_XXS) {
|
| 8725 |
-
q2xs_init_impl(256);
|
| 8726 |
-
}
|
| 8727 |
-
else if (type == GGML_TYPE_IQ2_XS) {
|
| 8728 |
-
q2xs_init_impl(512);
|
| 8729 |
-
}
|
| 8730 |
-
else {
|
| 8731 |
-
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
| 8732 |
-
}
|
| 8733 |
-
}
|
| 8734 |
-
|
| 8735 |
-
static void q2xs_deinit_impl(int grid_size) {
|
| 8736 |
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
| 8737 |
const int gindex = iq2_data_index(grid_size);
|
| 8738 |
if (iq2_data[gindex].grid) {
|
|
@@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
|
|
| 8742 |
}
|
| 8743 |
}
|
| 8744 |
|
| 8745 |
-
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
| 8746 |
-
if (type == GGML_TYPE_IQ2_XXS) {
|
| 8747 |
-
q2xs_deinit_impl(256);
|
| 8748 |
-
}
|
| 8749 |
-
else if (type == GGML_TYPE_IQ2_XS) {
|
| 8750 |
-
q2xs_deinit_impl(512);
|
| 8751 |
-
}
|
| 8752 |
-
else {
|
| 8753 |
-
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
| 8754 |
-
}
|
| 8755 |
-
}
|
| 8756 |
-
|
| 8757 |
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
| 8758 |
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
| 8759 |
int num_neighbors = neighbours[0];
|
|
@@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
| 8786 |
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 8787 |
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 8788 |
|
| 8789 |
-
GGML_ASSERT(quant_weights);
|
| 8790 |
-
GGML_ASSERT(kgrid_q2xs);
|
| 8791 |
-
GGML_ASSERT(kmap_q2xs);
|
| 8792 |
-
GGML_ASSERT(kneighbors_q2xs);
|
| 8793 |
GGML_ASSERT(n%QK_K == 0);
|
| 8794 |
|
| 8795 |
const int kMaxQ = 3;
|
|
@@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
| 9005 |
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 9006 |
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 9007 |
|
| 9008 |
-
GGML_ASSERT(quant_weights);
|
| 9009 |
-
GGML_ASSERT(kmap_q2xs);
|
| 9010 |
-
GGML_ASSERT(kgrid_q2xs);
|
| 9011 |
-
GGML_ASSERT(kneighbors_q2xs);
|
| 9012 |
GGML_ASSERT(n%QK_K == 0);
|
| 9013 |
|
| 9014 |
const int kMaxQ = 3;
|
|
|
|
| 1274 |
}
|
| 1275 |
float sumlx = 0;
|
| 1276 |
float suml2 = 0;
|
| 1277 |
+
#ifdef HAVE_BUGGY_APPLE_LINKER
|
| 1278 |
+
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
| 1279 |
+
for (volatile int i = 0; i < n; ++i) {
|
| 1280 |
+
#else
|
| 1281 |
for (int i = 0; i < n; ++i) {
|
| 1282 |
+
#endif
|
| 1283 |
int l = nearest_int(iscale * x[i]);
|
| 1284 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1285 |
L[i] = l + nmax;
|
|
|
|
| 1654 |
float max = x[0];
|
| 1655 |
float sum_w = weights ? weights[0] : x[0]*x[0];
|
| 1656 |
float sum_x = sum_w * x[0];
|
| 1657 |
+
#ifdef HAVE_BUGGY_APPLE_LINKER
|
| 1658 |
+
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
| 1659 |
+
for (volatile int i = 1; i < n; ++i) {
|
| 1660 |
+
#else
|
| 1661 |
for (int i = 1; i < n; ++i) {
|
| 1662 |
+
#endif
|
| 1663 |
if (x[i] < min) min = x[i];
|
| 1664 |
if (x[i] > max) max = x[i];
|
| 1665 |
float w = weights ? weights[i] : x[i]*x[i];
|
|
|
|
| 1670 |
min = 0;
|
| 1671 |
}
|
| 1672 |
if (max <= min) {
|
| 1673 |
+
memset(L, 0, n);
|
| 1674 |
*the_min = -min;
|
| 1675 |
return 0.f;
|
| 1676 |
}
|
|
|
|
| 1872 |
|
| 1873 |
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 1874 |
(void)hist;
|
| 1875 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1876 |
if (!quant_weights) {
|
| 1877 |
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
| 1878 |
}
|
|
|
|
| 2191 |
|
| 2192 |
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2193 |
(void)hist;
|
| 2194 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2195 |
if (!quant_weights) {
|
| 2196 |
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
| 2197 |
}
|
|
|
|
| 2458 |
|
| 2459 |
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2460 |
(void)hist;
|
| 2461 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2462 |
if (!quant_weights) {
|
| 2463 |
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
| 2464 |
}
|
|
|
|
| 2781 |
|
| 2782 |
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2783 |
(void)hist;
|
| 2784 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2785 |
if (!quant_weights) {
|
| 2786 |
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
| 2787 |
}
|
|
|
|
| 3035 |
|
| 3036 |
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3037 |
(void)hist;
|
| 3038 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3039 |
if (!quant_weights) {
|
| 3040 |
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
| 3041 |
}
|
|
|
|
| 3082 |
if (!quant_weights) {
|
| 3083 |
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3084 |
}
|
| 3085 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3086 |
char * qrow = (char *)dst;
|
| 3087 |
for (int row = 0; row < nrow; ++row) {
|
| 3088 |
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
|
|
|
| 3126 |
if (!quant_weights) {
|
| 3127 |
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3128 |
}
|
| 3129 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3130 |
char * qrow = (char *)dst;
|
| 3131 |
for (int row = 0; row < nrow; ++row) {
|
| 3132 |
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
|
|
|
| 3179 |
if (!quant_weights) {
|
| 3180 |
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3181 |
}
|
| 3182 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3183 |
char * qrow = (char *)dst;
|
| 3184 |
for (int row = 0; row < nrow; ++row) {
|
| 3185 |
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
|
|
|
| 3231 |
if (!quant_weights) {
|
| 3232 |
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
| 3233 |
}
|
| 3234 |
+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3235 |
char * qrow = (char *)dst;
|
| 3236 |
for (int row = 0; row < nrow; ++row) {
|
| 3237 |
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
|
|
|
| 8575 |
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
| 8576 |
}
|
| 8577 |
|
| 8578 |
+
void iq2xs_init_impl(int grid_size) {
|
| 8579 |
const int gindex = iq2_data_index(grid_size);
|
| 8580 |
if (iq2_data[gindex].grid) {
|
| 8581 |
return;
|
|
|
|
| 8730 |
free(dist2);
|
| 8731 |
}
|
| 8732 |
|
| 8733 |
+
void iq2xs_free_impl(int grid_size) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8734 |
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
| 8735 |
const int gindex = iq2_data_index(grid_size);
|
| 8736 |
if (iq2_data[gindex].grid) {
|
|
|
|
| 8740 |
}
|
| 8741 |
}
|
| 8742 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8743 |
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
| 8744 |
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
| 8745 |
int num_neighbors = neighbours[0];
|
|
|
|
| 8772 |
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 8773 |
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 8774 |
|
| 8775 |
+
GGML_ASSERT(quant_weights && "missing quantization weights");
|
| 8776 |
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8777 |
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8778 |
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8779 |
GGML_ASSERT(n%QK_K == 0);
|
| 8780 |
|
| 8781 |
const int kMaxQ = 3;
|
|
|
|
| 8991 |
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 8992 |
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 8993 |
|
| 8994 |
+
GGML_ASSERT(quant_weights && "missing quantization weights");
|
| 8995 |
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8996 |
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8997 |
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
| 8998 |
GGML_ASSERT(n%QK_K == 0);
|
| 8999 |
|
| 9000 |
const int kMaxQ = 3;
|
ggml-quants.h
CHANGED
|
@@ -257,3 +257,6 @@ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row,
|
|
| 257 |
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 258 |
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 259 |
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 258 |
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 259 |
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 260 |
+
|
| 261 |
+
void iq2xs_init_impl(int grid_size);
|
| 262 |
+
void iq2xs_free_impl(int grid_size);
|
ggml.c
CHANGED
|
@@ -18524,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
| 18524 |
|
| 18525 |
////////////////////////////////////////////////////////////////////////////////
|
| 18526 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18527 |
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 18528 |
assert(k % QK4_0 == 0);
|
| 18529 |
const int nb = k / QK4_0;
|
|
@@ -18651,9 +18673,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|
| 18651 |
return (n/QK8_0*sizeof(block_q8_0));
|
| 18652 |
}
|
| 18653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18654 |
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
| 18655 |
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
| 18656 |
-
(
|
| 18657 |
size_t result = 0;
|
| 18658 |
int n = nrows * n_per_row;
|
| 18659 |
switch (type) {
|
|
@@ -18766,13 +18794,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 18766 |
} break;
|
| 18767 |
case GGML_TYPE_F16:
|
| 18768 |
{
|
| 18769 |
-
|
| 18770 |
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
| 18771 |
result = n * elemsize;
|
| 18772 |
} break;
|
| 18773 |
case GGML_TYPE_F32:
|
| 18774 |
{
|
| 18775 |
-
|
| 18776 |
result = n * elemsize;
|
| 18777 |
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
| 18778 |
} break;
|
|
|
|
| 18524 |
|
| 18525 |
////////////////////////////////////////////////////////////////////////////////
|
| 18526 |
|
| 18527 |
+
void ggml_quantize_init(enum ggml_type type) {
|
| 18528 |
+
ggml_critical_section_start();
|
| 18529 |
+
|
| 18530 |
+
switch (type) {
|
| 18531 |
+
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
| 18532 |
+
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
| 18533 |
+
default: // nothing
|
| 18534 |
+
break;
|
| 18535 |
+
}
|
| 18536 |
+
|
| 18537 |
+
ggml_critical_section_end();
|
| 18538 |
+
}
|
| 18539 |
+
|
| 18540 |
+
void ggml_quantize_free(void) {
|
| 18541 |
+
ggml_critical_section_start();
|
| 18542 |
+
|
| 18543 |
+
iq2xs_free_impl(256);
|
| 18544 |
+
iq2xs_free_impl(512);
|
| 18545 |
+
|
| 18546 |
+
ggml_critical_section_end();
|
| 18547 |
+
}
|
| 18548 |
+
|
| 18549 |
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 18550 |
assert(k % QK4_0 == 0);
|
| 18551 |
const int nb = k / QK4_0;
|
|
|
|
| 18673 |
return (n/QK8_0*sizeof(block_q8_0));
|
| 18674 |
}
|
| 18675 |
|
| 18676 |
+
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
| 18677 |
+
return
|
| 18678 |
+
type == GGML_TYPE_IQ2_XXS ||
|
| 18679 |
+
type == GGML_TYPE_IQ2_XS;
|
| 18680 |
+
}
|
| 18681 |
+
|
| 18682 |
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
| 18683 |
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
| 18684 |
+
ggml_quantize_init(type); // this is noop if already initialized
|
| 18685 |
size_t result = 0;
|
| 18686 |
int n = nrows * n_per_row;
|
| 18687 |
switch (type) {
|
|
|
|
| 18794 |
} break;
|
| 18795 |
case GGML_TYPE_F16:
|
| 18796 |
{
|
| 18797 |
+
size_t elemsize = sizeof(ggml_fp16_t);
|
| 18798 |
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
| 18799 |
result = n * elemsize;
|
| 18800 |
} break;
|
| 18801 |
case GGML_TYPE_F32:
|
| 18802 |
{
|
| 18803 |
+
size_t elemsize = sizeof(float);
|
| 18804 |
result = n * elemsize;
|
| 18805 |
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
| 18806 |
} break;
|
ggml.h
CHANGED
|
@@ -2065,6 +2065,18 @@ extern "C" {
|
|
| 2065 |
// quantization
|
| 2066 |
//
|
| 2067 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2068 |
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
| 2069 |
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2070 |
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
@@ -2078,13 +2090,13 @@ extern "C" {
|
|
| 2078 |
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2079 |
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2080 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2081 |
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
| 2082 |
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 2083 |
|
| 2084 |
-
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
| 2085 |
-
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
| 2086 |
-
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
| 2087 |
-
|
| 2088 |
//
|
| 2089 |
// gguf
|
| 2090 |
//
|
|
|
|
| 2065 |
// quantization
|
| 2066 |
//
|
| 2067 |
|
| 2068 |
+
// - ggml_quantize_init can be called multiple times with the same type
|
| 2069 |
+
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
| 2070 |
+
// automatically called by ggml_quantize_chunk for convenience
|
| 2071 |
+
//
|
| 2072 |
+
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
| 2073 |
+
// call this at the end of the program to avoid memory leaks
|
| 2074 |
+
//
|
| 2075 |
+
// note: these are thread-safe
|
| 2076 |
+
//
|
| 2077 |
+
GGML_API void ggml_quantize_init(enum ggml_type type);
|
| 2078 |
+
GGML_API void ggml_quantize_free(void);
|
| 2079 |
+
|
| 2080 |
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
| 2081 |
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2082 |
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
|
|
| 2090 |
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2091 |
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2092 |
|
| 2093 |
+
// some quantization type cannot be used without an importance matrix
|
| 2094 |
+
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
| 2095 |
+
|
| 2096 |
+
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
| 2097 |
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
| 2098 |
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 2099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2100 |
//
|
| 2101 |
// gguf
|
| 2102 |
//
|