Spaces:
Sleeping
Sleeping
Add ability to use importance matrix for all k-quants (llama/4930)
Browse files
- ggml-quants.c +437 -6
- ggml-quants.h +4 -1
- ggml.c +20 -8
ggml-quants.c
CHANGED
|
@@ -1244,7 +1244,8 @@ static inline int nearest_int(float fval) {
|
|
| 1244 |
return (i & 0x007fffff) - 0x00400000;
|
| 1245 |
}
|
| 1246 |
|
| 1247 |
-
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type
|
|
|
|
| 1248 |
float max = 0;
|
| 1249 |
float amax = 0;
|
| 1250 |
for (int i = 0; i < n; ++i) {
|
|
@@ -1270,14 +1271,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
| 1270 |
rmse_type = -rmse_type;
|
| 1271 |
return_early = true;
|
| 1272 |
}
|
| 1273 |
-
int weight_type = rmse_type%2;
|
| 1274 |
float sumlx = 0;
|
| 1275 |
float suml2 = 0;
|
| 1276 |
for (int i = 0; i < n; ++i) {
|
| 1277 |
int l = nearest_int(iscale * x[i]);
|
| 1278 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1279 |
L[i] = l + nmax;
|
| 1280 |
-
float w =
|
| 1281 |
sumlx += w*x[i]*l;
|
| 1282 |
suml2 += w*l*l;
|
| 1283 |
}
|
|
@@ -1293,7 +1293,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
| 1293 |
for (int i = 0; i < n; ++i) {
|
| 1294 |
int l = nearest_int(iscale * x[i]);
|
| 1295 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1296 |
-
float w =
|
| 1297 |
sumlx += w*x[i]*l;
|
| 1298 |
suml2 += w*l*l;
|
| 1299 |
}
|
|
@@ -2089,6 +2089,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
|
|
| 2089 |
return (n/QK_K*sizeof(block_q3_K));
|
| 2090 |
}
|
| 2091 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2092 |
// ====================== 4-bit (de)-quantization
|
| 2093 |
|
| 2094 |
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
|
@@ -2254,6 +2360,108 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
|
|
| 2254 |
return (n/QK_K*sizeof(block_q4_K));
|
| 2255 |
}
|
| 2256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2257 |
// ====================== 5-bit (de)-quantization
|
| 2258 |
|
| 2259 |
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
|
@@ -2349,7 +2557,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
| 2349 |
#else
|
| 2350 |
float max_scale = 0, amax = 0;
|
| 2351 |
for (int j = 0; j < QK_K/16; ++j) {
|
| 2352 |
-
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
|
| 2353 |
float abs_scale = fabsf(scales[j]);
|
| 2354 |
if (abs_scale > amax) {
|
| 2355 |
amax = abs_scale;
|
|
@@ -2460,6 +2668,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
|
|
| 2460 |
return (n/QK_K*sizeof(block_q5_K));
|
| 2461 |
}
|
| 2462 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2463 |
// ====================== 6-bit (de)-quantization
|
| 2464 |
|
| 2465 |
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
|
@@ -2476,7 +2801,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
| 2476 |
|
| 2477 |
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 2478 |
|
| 2479 |
-
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
|
| 2480 |
scales[ib] = scale;
|
| 2481 |
|
| 2482 |
const float abs_scale = fabsf(scale);
|
|
@@ -2608,6 +2933,112 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
|
|
| 2608 |
return (n/QK_K*sizeof(block_q6_K));
|
| 2609 |
}
|
| 2610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2611 |
// ====================== "True" 2-bit (de)-quantization
|
| 2612 |
|
| 2613 |
static const uint64_t iq2xxs_grid[256] = {
|
|
|
|
| 1244 |
return (i & 0x007fffff) - 0x00400000;
|
| 1245 |
}
|
| 1246 |
|
| 1247 |
+
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
|
| 1248 |
+
const float * restrict qw) {
|
| 1249 |
float max = 0;
|
| 1250 |
float amax = 0;
|
| 1251 |
for (int i = 0; i < n; ++i) {
|
|
|
|
| 1271 |
rmse_type = -rmse_type;
|
| 1272 |
return_early = true;
|
| 1273 |
}
|
|
|
|
| 1274 |
float sumlx = 0;
|
| 1275 |
float suml2 = 0;
|
| 1276 |
for (int i = 0; i < n; ++i) {
|
| 1277 |
int l = nearest_int(iscale * x[i]);
|
| 1278 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1279 |
L[i] = l + nmax;
|
| 1280 |
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
| 1281 |
sumlx += w*x[i]*l;
|
| 1282 |
suml2 += w*l*l;
|
| 1283 |
}
|
|
|
|
| 1293 |
for (int i = 0; i < n; ++i) {
|
| 1294 |
int l = nearest_int(iscale * x[i]);
|
| 1295 |
l = MAX(-nmax, MIN(nmax-1, l));
|
| 1296 |
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
| 1297 |
sumlx += w*x[i]*l;
|
| 1298 |
suml2 += w*l*l;
|
| 1299 |
}
|
|
|
|
| 2089 |
return (n/QK_K*sizeof(block_q3_K));
|
| 2090 |
}
|
| 2091 |
|
| 2092 |
+
// Quantize one row of floats into block_q3_K super-blocks, optionally guided by
// per-value importance weights.
//
//   x             - input row; n_per_row values, n_per_row must be a multiple of QK_K (asserted)
//   y             - output; receives n_per_row / QK_K blocks
//   n_per_row     - number of floats in the row
//   quant_weights - optional importance values, read at offset QK_K*ib + 16*j for sub-block j
//                   of super-block ib; when NULL the squared inputs are used as weights
//
// When QK_K != 256 the weights are ignored and the call is forwarded to
// quantize_row_q3_K_reference.
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q3_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_blocks = n_per_row / QK_K;

    int8_t quants[QK_K];           // quant index of every value in the current super-block
    float  sub_scales[QK_K / 16];  // float scale of every 16-value sub-block
    float  sub_weights[QK_K / 16]; // summed weight of every 16-value sub-block
    int8_t scale_q[QK_K / 16];     // quantized sub-block scales
    float  w[16];                  // weights of the sub-block being processed

    for (int ib = 0; ib < n_blocks; ib++) {
        block_q3_K * blk = &y[ib];

        float sumx2 = 0;
        for (int k = 0; k < QK_K; ++k) {
            sumx2 += x[k]*x[k];
        }
        const float sigma2 = 2*sumx2/QK_K;

        // First pass: per-sub-block weighted fit with 3-bit range (nmax = 4).
        for (int j = 0; j < QK_K/16; ++j) {
            const float * xs = x + 16*j;
            if (quant_weights) {
                const float * qw = quant_weights + QK_K * ib + 16*j;
                for (int l = 0; l < 16; ++l) {
                    w[l] = qw[l] * sqrtf(sigma2 + xs[l]*xs[l]);
                }
            } else {
                for (int l = 0; l < 16; ++l) {
                    w[l] = xs[l]*xs[l];
                }
            }
            float sumw = 0;
            for (int l = 0; l < 16; ++l) {
                sumw += w[l];
            }
            sub_weights[j] = sumw;
            sub_scales[j]  = make_qx_quants(16, 4, xs, quants + 16*j, 1, w);
        }

        memset(blk->scales, 0, 12);

        // Quantize the 16 sub-block scales themselves (nmax = 32), weighting each by its summed weight,
        // then pack: low 4 bits go into nibbles of scales[0..7], the remaining high bits into scales[8..11].
        const float d_block = make_qx_quants(QK_K/16, 32, sub_scales, scale_q, 1, sub_weights);
        for (int j = 0; j < QK_K/16; ++j) {
            const int q  = scale_q[j];
            const int lo = q & 0xF;
            const int hi = q >> 4;
            if (j < 8) {
                blk->scales[j] = lo;
            } else {
                blk->scales[j-8] |= (lo << 4);
            }
            blk->scales[j%4 + 8] |= (hi << (2*(j/4)));
        }
        blk->d = GGML_FP32_TO_FP16(d_block);

        // Second pass: re-quantize the values against the scales as they were actually stored.
        int8_t sc;
        for (int j = 0; j < QK_K/16; ++j) {
            sc = j < 8 ? blk->scales[j] & 0xF : blk->scales[j-8] >> 4;
            sc = (sc | (((blk->scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
            const float d = GGML_FP16_TO_FP32(blk->d) * sc;
            if (!d) {
                // zero effective scale: keep the quants from the first pass
                continue;
            }
            for (int k = 0; k < 16; ++k) {
                int l = nearest_int(x[16*j + k]/d);
                l = MAX(-4, MIN(3, l));
                quants[16*j + k] = l + 4;
            }
        }

        memset(blk->hmask, 0, QK_K/8);
        // The high bit of the 1st QK_K/8 quants goes into bit 0 of hmask[], the next QK_K/8 into bit 1, etc.
        for (int k = 0; k < QK_K; ++k) {
            if (quants[k] > 3) {
                blk->hmask[k % (QK_K/8)] |= (uint8_t)(1 << (k / (QK_K/8)));
                quants[k] -= 4;
            }
        }
        // Pack the remaining 2-bit values, four per byte, taken 32 positions apart.
        for (int base = 0; base < QK_K; base += 128) {
            const int8_t * q = quants + base;
            for (int l = 0; l < 32; ++l) {
                blk->qs[base/4 + l] = q[l] | (q[l + 32] << 2) | (q[l + 64] << 4) | (q[l + 96] << 6);
            }
        }

        x += QK_K;
    }
#endif
}
|
| 2180 |
+
|
| 2181 |
+
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2182 |
+
(void)hist;
|
| 2183 |
+
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2184 |
+
if (!quant_weights) {
|
| 2185 |
+
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
| 2186 |
+
}
|
| 2187 |
+
else {
|
| 2188 |
+
char * qrow = (char *)dst;
|
| 2189 |
+
for (int row = 0; row < nrow; ++row) {
|
| 2190 |
+
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
| 2191 |
+
src += n_per_row;
|
| 2192 |
+
qrow += row_size;
|
| 2193 |
+
}
|
| 2194 |
+
}
|
| 2195 |
+
return nrow * row_size;
|
| 2196 |
+
}
|
| 2197 |
+
|
| 2198 |
// ====================== 4-bit (de)-quantization
|
| 2199 |
|
| 2200 |
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
|
|
|
| 2360 |
return (n/QK_K*sizeof(block_q4_K));
|
| 2361 |
}
|
| 2362 |
|
| 2363 |
+
// Quantize one row of floats into block_q4_K super-blocks, optionally guided by
// per-value importance weights.
//
//   x             - input row; n_per_row values, n_per_row must be a multiple of QK_K (asserted)
//   y             - output; receives n_per_row / QK_K blocks
//   n_per_row     - number of floats in the row
//   quant_weights - optional importance values, read at offset QK_K*ib + 32*j for sub-block j
//                   of super-block ib; when NULL, weights are av_x + |x| with av_x the RMS of
//                   the super-block
//
// When QK_K != 256 the weights are ignored and the call is forwarded to
// quantize_row_q4_K_reference.
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q4_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_blocks = n_per_row / QK_K;

    uint8_t quants[QK_K];        // quant index of every value in the current super-block
    uint8_t scratch[32];         // work buffer handed to make_qkx3_quants
    float   w[32];               // weights of the sub-block being processed
    float   sub_mins[QK_K/32];
    float   sub_scales[QK_K/32];

    for (int ib = 0; ib < n_blocks; ib++) {
        block_q4_K * blk = &y[ib];

        float sum_x2 = 0;
        for (int k = 0; k < QK_K; ++k) {
            sum_x2 += x[k] * x[k];
        }
        const float sigma2 = sum_x2/QK_K;
        const float av_x   = sqrtf(sigma2);

        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min   = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            const float * xs = x + 32*j;
            if (quant_weights) {
                const float * qw = quant_weights + QK_K*ib + 32*j;
                for (int l = 0; l < 32; ++l) {
                    w[l] = qw[l] * sqrtf(sigma2 + xs[l]*xs[l]);
                }
            } else {
                for (int l = 0; l < 32; ++l) {
                    w[l] = av_x + fabsf(xs[l]);
                }
            }
            sub_scales[j] = make_qkx3_quants(32, 15, xs, w, quants + 32*j, &sub_mins[j], scratch, -0.9f, 0.05f, 36, false);
            if (sub_scales[j] > max_scale) {
                max_scale = sub_scales[j];
            }
            if (sub_mins[j] > max_min) {
                max_min = sub_mins[j];
            }
        }

        // Quantize sub-block scales and mins to 6 bits relative to their maxima and pack them:
        // the first four pairs are stored directly, the last four are split into a nibble pair
        // plus two high bits placed in the top bits of the earlier bytes.
        const float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
        const float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
        for (int j = 0; j < QK_K/32; ++j) {
            uint8_t ls = nearest_int(inv_scale*sub_scales[j]);
            uint8_t lm = nearest_int(inv_min*sub_mins[j]);
            if (ls > 63) {
                ls = 63;
            }
            if (lm > 63) {
                lm = 63;
            }
            if (j < 4) {
                blk->scales[j]   = ls;
                blk->scales[j+4] = lm;
            } else {
                blk->scales[j+4]  = (ls & 0xF) | ((lm & 0xF) << 4);
                blk->scales[j-4] |= ((ls >> 4) << 6);
                blk->scales[j]   |= ((lm >> 4) << 6);
            }
        }
        blk->d    = GGML_FP32_TO_FP16(max_scale/63.f);
        blk->dmin = GGML_FP32_TO_FP16(max_min/63.f);

        // Re-quantize the values against the scale/min pairs as they were actually stored.
        uint8_t sc, m;
        for (int j = 0; j < QK_K/32; ++j) {
            get_scale_min_k4(j, blk->scales, &sc, &m);
            const float d = GGML_FP16_TO_FP32(blk->d) * sc;
            if (!d) {
                // zero effective scale: keep the quants produced by make_qkx3_quants
                continue;
            }
            const float dm = GGML_FP16_TO_FP32(blk->dmin) * m;
            for (int k = 0; k < 32; ++k) {
                int l = nearest_int((x[32*j + k] + dm)/d);
                l = MAX(0, MIN(15, l));
                quants[32*j + k] = l;
            }
        }

        // Pack two 4-bit quants per byte: value l and value l+32 of each 64-value group.
        for (int base = 0; base < QK_K; base += 64) {
            uint8_t * q = blk->qs + base/2;
            for (int l = 0; l < 32; ++l) {
                q[l] = quants[base + l] | (quants[base + l + 32] << 4);
            }
        }

        x += QK_K;
    }
#endif
}
|
| 2447 |
+
|
| 2448 |
+
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2449 |
+
(void)hist;
|
| 2450 |
+
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2451 |
+
if (!quant_weights) {
|
| 2452 |
+
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
| 2453 |
+
}
|
| 2454 |
+
else {
|
| 2455 |
+
char * qrow = (char *)dst;
|
| 2456 |
+
for (int row = 0; row < nrow; ++row) {
|
| 2457 |
+
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
| 2458 |
+
src += n_per_row;
|
| 2459 |
+
qrow += row_size;
|
| 2460 |
+
}
|
| 2461 |
+
}
|
| 2462 |
+
return nrow * row_size;
|
| 2463 |
+
}
|
| 2464 |
+
|
| 2465 |
// ====================== 5-bit (de)-quantization
|
| 2466 |
|
| 2467 |
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
|
|
|
| 2557 |
#else
|
| 2558 |
float max_scale = 0, amax = 0;
|
| 2559 |
for (int j = 0; j < QK_K/16; ++j) {
|
| 2560 |
+
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
|
| 2561 |
float abs_scale = fabsf(scales[j]);
|
| 2562 |
if (abs_scale > amax) {
|
| 2563 |
amax = abs_scale;
|
|
|
|
| 2668 |
return (n/QK_K*sizeof(block_q5_K));
|
| 2669 |
}
|
| 2670 |
|
| 2671 |
+
// Quantize one row of floats into block_q5_K super-blocks, optionally guided by
// per-value importance weights.
//
//   x             - input row; n_per_row values, n_per_row must be a multiple of QK_K (asserted)
//   y             - output; receives n_per_row / QK_K blocks
//   n_per_row     - number of floats in the row
//   quant_weights - optional importance values, read at offset QK_K*ib + 32*j for sub-block j
//                   of super-block ib; when NULL, weights are av_x + |x| with av_x the RMS of
//                   the super-block
//
// When QK_K != 256 the weights are ignored and the call is forwarded to
// quantize_row_q5_K_reference.
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q5_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_blocks = n_per_row / QK_K;

    uint8_t quants[QK_K];        // quant index of every value in the current super-block
    float   sub_mins[QK_K/32];
    float   sub_scales[QK_K/32];
    float   w[32];               // weights of the sub-block being processed
    uint8_t scratch[32];         // work buffer handed to make_qkx3_quants

    for (int ib = 0; ib < n_blocks; ib++) {
        block_q5_K * blk = &y[ib];

        float sum_x2 = 0;
        for (int k = 0; k < QK_K; ++k) {
            sum_x2 += x[k] * x[k];
        }
        const float sigma2 = sum_x2/QK_K;
        const float av_x   = sqrtf(sigma2);

        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min   = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            const float * xs = x + 32*j;
            if (quant_weights) {
                const float * qw = quant_weights + QK_K*ib + 32*j;
                for (int l = 0; l < 32; ++l) {
                    w[l] = qw[l] * sqrtf(sigma2 + xs[l]*xs[l]);
                }
            } else {
                for (int l = 0; l < 32; ++l) {
                    w[l] = av_x + fabsf(xs[l]);
                }
            }
            sub_scales[j] = make_qkx3_quants(32, 31, xs, w, quants + 32*j, &sub_mins[j], scratch, -0.9f, 0.05f, 36, false);
            if (sub_scales[j] > max_scale) {
                max_scale = sub_scales[j];
            }
            if (sub_mins[j] > max_min) {
                max_min = sub_mins[j];
            }
        }

        // Quantize sub-block scales and mins to 6 bits relative to their maxima and pack them:
        // the first four pairs are stored directly, the last four are split into a nibble pair
        // plus two high bits placed in the top bits of the earlier bytes.
        const float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
        const float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
        for (int j = 0; j < QK_K/32; ++j) {
            uint8_t ls = nearest_int(inv_scale*sub_scales[j]);
            uint8_t lm = nearest_int(inv_min*sub_mins[j]);
            if (ls > 63) {
                ls = 63;
            }
            if (lm > 63) {
                lm = 63;
            }
            if (j < 4) {
                blk->scales[j]   = ls;
                blk->scales[j+4] = lm;
            } else {
                blk->scales[j+4]  = (ls & 0xF) | ((lm & 0xF) << 4);
                blk->scales[j-4] |= ((ls >> 4) << 6);
                blk->scales[j]   |= ((lm >> 4) << 6);
            }
        }
        blk->d    = GGML_FP32_TO_FP16(max_scale/63.f);
        blk->dmin = GGML_FP32_TO_FP16(max_min/63.f);

        // Re-quantize the values against the scale/min pairs as they were actually stored.
        uint8_t sc, m;
        for (int j = 0; j < QK_K/32; ++j) {
            get_scale_min_k4(j, blk->scales, &sc, &m);
            const float d = GGML_FP16_TO_FP32(blk->d) * sc;
            if (!d) {
                // zero effective scale: keep the quants produced by make_qkx3_quants
                continue;
            }
            const float dm = GGML_FP16_TO_FP32(blk->dmin) * m;
            for (int k = 0; k < 32; ++k) {
                int l = nearest_int((x[32*j + k] + dm)/d);
                l = MAX(0, MIN(31, l));
                quants[32*j + k] = l;
            }
        }

        uint8_t * restrict qh = blk->qh;
        uint8_t * restrict ql = blk->qs;
        memset(qh, 0, QK_K/8);

        // Each 64-value group contributes two bits per qh byte (one per half of the group)
        // and one low-nibble pair per ql byte.
        uint8_t bit_lo = 1;
        uint8_t bit_hi = 2;
        for (int base = 0; base < QK_K; base += 64) {
            for (int l = 0; l < 32; ++l) {
                int a = quants[base + l];
                int b = quants[base + l + 32];
                if (a > 15) {
                    a -= 16;
                    qh[l] |= bit_lo;
                }
                if (b > 15) {
                    b -= 16;
                    qh[l] |= bit_hi;
                }
                ql[l] = a | (b << 4);
            }
            bit_lo <<= 2;
            bit_hi <<= 2;
            ql += 32;
        }

        x += QK_K;
    }
#endif
}
|
| 2770 |
+
|
| 2771 |
+
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2772 |
+
(void)hist;
|
| 2773 |
+
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2774 |
+
if (!quant_weights) {
|
| 2775 |
+
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
| 2776 |
+
}
|
| 2777 |
+
else {
|
| 2778 |
+
char * qrow = (char *)dst;
|
| 2779 |
+
for (int row = 0; row < nrow; ++row) {
|
| 2780 |
+
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
| 2781 |
+
src += n_per_row;
|
| 2782 |
+
qrow += row_size;
|
| 2783 |
+
}
|
| 2784 |
+
}
|
| 2785 |
+
return nrow * row_size;
|
| 2786 |
+
}
|
| 2787 |
+
|
| 2788 |
// ====================== 6-bit (de)-quantization
|
| 2789 |
|
| 2790 |
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
|
|
|
| 2801 |
|
| 2802 |
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 2803 |
|
| 2804 |
+
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
|
| 2805 |
scales[ib] = scale;
|
| 2806 |
|
| 2807 |
const float abs_scale = fabsf(scale);
|
|
|
|
| 2933 |
return (n/QK_K*sizeof(block_q6_K));
|
| 2934 |
}
|
| 2935 |
|
| 2936 |
+
// Quantize one row of floats into block_q6_K super-blocks, optionally guided by
// per-value importance weights.
//
//   x             - input row; n_per_row values, n_per_row must be a multiple of QK_K (asserted)
//   y             - output; receives n_per_row / QK_K blocks
//   n_per_row     - number of floats in the row
//   quant_weights - optional importance values; the slice at offset QK_K*ib + 16*j is passed
//                   unmodified to make_qx_quants for sub-block j of super-block ib. When NULL,
//                   make_qx_quants is called without weights.
//
// When QK_K != 256 the weights are ignored and the call is forwarded to
// quantize_row_q6_K_reference.
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q6_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_blocks = n_per_row / QK_K;

    int8_t quants[QK_K];         // quant index of every value in the current super-block
    float  sub_scales[QK_K/16];  // float scale of every 16-value sub-block

    for (int ib = 0; ib < n_blocks; ib++) {
        block_q6_K * blk = &y[ib];

        // Track the sub-block scale of largest magnitude, keeping its sign.
        float max_scale     = 0;
        float max_abs_scale = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * qw = quant_weights ? quant_weights + QK_K*ib + 16*j : NULL;
            const float scale = make_qx_quants(16, 32, x + 16*j, quants + 16*j, 1, qw);
            sub_scales[j] = scale;

            const float abs_scale = fabsf(scale);
            if (abs_scale > max_abs_scale) {
                max_abs_scale = abs_scale;
                max_scale     = scale;
            }
        }

        if (!max_abs_scale) {
            // every sub-block scale is zero: emit an all-zero block
            memset(blk, 0, sizeof(block_q6_K));
            blk->d = GGML_FP32_TO_FP16(0.f);
            x += QK_K;
            continue;
        }

        // Map the dominant scale to -128 and store the sub-block scales as integers capped at 127.
        const float iscale = -128.f/max_scale;
        blk->d = GGML_FP32_TO_FP16(1/iscale);
        for (int j = 0; j < QK_K/16; ++j) {
            const int q = nearest_int(iscale*sub_scales[j]);
            blk->scales[j] = MIN(127, q);
        }

        // Re-quantize the values against the scales as they were actually stored.
        for (int j = 0; j < QK_K/16; ++j) {
            const float d = GGML_FP16_TO_FP32(blk->d) * blk->scales[j];
            if (!d) {
                // zero effective scale: keep the quants produced by make_qx_quants
                continue;
            }
            for (int k = 0; k < 16; ++k) {
                int l = nearest_int(x[16*j + k]/d);
                l = MAX(-32, MIN(31, l));
                quants[16*j + k] = l + 32;
            }
        }

        // Pack per 128-value group: low 4 bits of each quant into ql (two per byte),
        // high 2 bits of four quants taken 32 positions apart into one qh byte.
        uint8_t * restrict ql = blk->ql;
        uint8_t * restrict qh = blk->qh;
        for (int base = 0; base < QK_K; base += 128) {
            const int8_t * q = quants + base;
            for (int l = 0; l < 32; ++l) {
                const uint8_t lo0 = q[l +  0] & 0xF;
                const uint8_t lo1 = q[l + 32] & 0xF;
                const uint8_t lo2 = q[l + 64] & 0xF;
                const uint8_t lo3 = q[l + 96] & 0xF;
                ql[l+ 0] = lo0 | (lo2 << 4);
                ql[l+32] = lo1 | (lo3 << 4);
                qh[l] = (q[l] >> 4) | ((q[l + 32] >> 4) << 2) | ((q[l + 64] >> 4) << 4) | ((q[l + 96] >> 4) << 6);
            }
            ql += 64;
            qh += 32;
        }

        x += QK_K;
    }
#endif
}
|
| 3024 |
+
|
| 3025 |
+
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3026 |
+
(void)hist;
|
| 3027 |
+
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3028 |
+
if (!quant_weights) {
|
| 3029 |
+
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
| 3030 |
+
}
|
| 3031 |
+
else {
|
| 3032 |
+
char * qrow = (char *)dst;
|
| 3033 |
+
for (int row = 0; row < nrow; ++row) {
|
| 3034 |
+
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
| 3035 |
+
src += n_per_row;
|
| 3036 |
+
qrow += row_size;
|
| 3037 |
+
}
|
| 3038 |
+
}
|
| 3039 |
+
return nrow * row_size;
|
| 3040 |
+
}
|
| 3041 |
+
|
| 3042 |
// ====================== "True" 2-bit (de)-quantization
|
| 3043 |
|
| 3044 |
static const uint64_t iq2xxs_grid[256] = {
|
ggml-quants.h
CHANGED
|
@@ -249,4 +249,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict
|
|
| 249 |
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 250 |
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 251 |
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 250 |
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 251 |
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 252 |
+
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 253 |
+
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 254 |
+
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 255 |
+
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
ggml.c
CHANGED
|
@@ -18713,26 +18713,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 18713 |
case GGML_TYPE_Q3_K:
|
| 18714 |
{
|
| 18715 |
GGML_ASSERT(start % QK_K == 0);
|
| 18716 |
-
|
| 18717 |
-
|
|
|
|
|
|
|
|
|
|
| 18718 |
} break;
|
| 18719 |
case GGML_TYPE_Q4_K:
|
| 18720 |
{
|
| 18721 |
GGML_ASSERT(start % QK_K == 0);
|
| 18722 |
-
|
| 18723 |
-
|
|
|
|
|
|
|
|
|
|
| 18724 |
} break;
|
| 18725 |
case GGML_TYPE_Q5_K:
|
| 18726 |
{
|
| 18727 |
GGML_ASSERT(start % QK_K == 0);
|
| 18728 |
-
|
| 18729 |
-
|
|
|
|
|
|
|
|
|
|
| 18730 |
} break;
|
| 18731 |
case GGML_TYPE_Q6_K:
|
| 18732 |
{
|
| 18733 |
GGML_ASSERT(start % QK_K == 0);
|
| 18734 |
-
|
| 18735 |
-
|
|
|
|
|
|
|
|
|
|
| 18736 |
} break;
|
| 18737 |
case GGML_TYPE_IQ2_XXS:
|
| 18738 |
{
|
|
|
|
| 18713 |
case GGML_TYPE_Q3_K:
|
| 18714 |
{
|
| 18715 |
GGML_ASSERT(start % QK_K == 0);
|
| 18716 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18717 |
+
size_t start_row = start / n_per_row;
|
| 18718 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18719 |
+
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18720 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18721 |
} break;
|
| 18722 |
case GGML_TYPE_Q4_K:
|
| 18723 |
{
|
| 18724 |
GGML_ASSERT(start % QK_K == 0);
|
| 18725 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18726 |
+
size_t start_row = start / n_per_row;
|
| 18727 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18728 |
+
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18729 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18730 |
} break;
|
| 18731 |
case GGML_TYPE_Q5_K:
|
| 18732 |
{
|
| 18733 |
GGML_ASSERT(start % QK_K == 0);
|
| 18734 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18735 |
+
size_t start_row = start / n_per_row;
|
| 18736 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18737 |
+
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18738 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18739 |
} break;
|
| 18740 |
case GGML_TYPE_Q6_K:
|
| 18741 |
{
|
| 18742 |
GGML_ASSERT(start % QK_K == 0);
|
| 18743 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18744 |
+
size_t start_row = start / n_per_row;
|
| 18745 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18746 |
+
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18747 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18748 |
} break;
|
| 18749 |
case GGML_TYPE_IQ2_XXS:
|
| 18750 |
{
|