Kawrakow ikawrakow committed on
Commit
7032309
·
unverified ·
1 Parent(s): 542e8da

Add ability to use importance matrix for all k-quants (llama/4930)

Browse files
Files changed (3) hide show
  1. ggml-quants.c +437 -6
  2. ggml-quants.h +4 -1
  3. ggml.c +20 -8
ggml-quants.c CHANGED
@@ -1244,7 +1244,8 @@ static inline int nearest_int(float fval) {
1244
  return (i & 0x007fffff) - 0x00400000;
1245
  }
1246
 
1247
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
 
1248
  float max = 0;
1249
  float amax = 0;
1250
  for (int i = 0; i < n; ++i) {
@@ -1270,14 +1271,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1270
  rmse_type = -rmse_type;
1271
  return_early = true;
1272
  }
1273
- int weight_type = rmse_type%2;
1274
  float sumlx = 0;
1275
  float suml2 = 0;
1276
  for (int i = 0; i < n; ++i) {
1277
  int l = nearest_int(iscale * x[i]);
1278
  l = MAX(-nmax, MIN(nmax-1, l));
1279
  L[i] = l + nmax;
1280
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1281
  sumlx += w*x[i]*l;
1282
  suml2 += w*l*l;
1283
  }
@@ -1293,7 +1293,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1293
  for (int i = 0; i < n; ++i) {
1294
  int l = nearest_int(iscale * x[i]);
1295
  l = MAX(-nmax, MIN(nmax-1, l));
1296
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1297
  sumlx += w*x[i]*l;
1298
  suml2 += w*l*l;
1299
  }
@@ -2089,6 +2089,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
2089
  return (n/QK_K*sizeof(block_q3_K));
2090
  }
2091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2092
  // ====================== 4-bit (de)-quantization
2093
 
2094
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
@@ -2254,6 +2360,108 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
2254
  return (n/QK_K*sizeof(block_q4_K));
2255
  }
2256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2257
  // ====================== 5-bit (de)-quantization
2258
 
2259
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
@@ -2349,7 +2557,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2349
  #else
2350
  float max_scale = 0, amax = 0;
2351
  for (int j = 0; j < QK_K/16; ++j) {
2352
- scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
2353
  float abs_scale = fabsf(scales[j]);
2354
  if (abs_scale > amax) {
2355
  amax = abs_scale;
@@ -2460,6 +2668,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
2460
  return (n/QK_K*sizeof(block_q5_K));
2461
  }
2462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2463
  // ====================== 6-bit (de)-quantization
2464
 
2465
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
@@ -2476,7 +2801,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2476
 
2477
  for (int ib = 0; ib < QK_K/16; ++ib) {
2478
 
2479
- const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
2480
  scales[ib] = scale;
2481
 
2482
  const float abs_scale = fabsf(scale);
@@ -2608,6 +2933,112 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
2608
  return (n/QK_K*sizeof(block_q6_K));
2609
  }
2610
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2611
  // ====================== "True" 2-bit (de)-quantization
2612
 
2613
  static const uint64_t iq2xxs_grid[256] = {
 
1244
  return (i & 0x007fffff) - 0x00400000;
1245
  }
1246
 
1247
+ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
1248
+ const float * restrict qw) {
1249
  float max = 0;
1250
  float amax = 0;
1251
  for (int i = 0; i < n; ++i) {
 
1271
  rmse_type = -rmse_type;
1272
  return_early = true;
1273
  }
 
1274
  float sumlx = 0;
1275
  float suml2 = 0;
1276
  for (int i = 0; i < n; ++i) {
1277
  int l = nearest_int(iscale * x[i]);
1278
  l = MAX(-nmax, MIN(nmax-1, l));
1279
  L[i] = l + nmax;
1280
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1281
  sumlx += w*x[i]*l;
1282
  suml2 += w*l*l;
1283
  }
 
1293
  for (int i = 0; i < n; ++i) {
1294
  int l = nearest_int(iscale * x[i]);
1295
  l = MAX(-nmax, MIN(nmax-1, l));
1296
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1297
  sumlx += w*x[i]*l;
1298
  suml2 += w*l*l;
1299
  }
 
2089
  return (n/QK_K*sizeof(block_q3_K));
2090
  }
2091
 
2092
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2093
+ #if QK_K != 256
2094
+ (void)quant_weights;
2095
+ quantize_row_q3_K_reference(x, y, n_per_row);
2096
+ #else
2097
+ assert(n_per_row % QK_K == 0);
2098
+ const int nb = n_per_row / QK_K;
2099
+
2100
+ int8_t L[QK_K];
2101
+ float scales[QK_K / 16];
2102
+ float weight[16];
2103
+ float sw[QK_K / 16];
2104
+ int8_t Ls[QK_K / 16];
2105
+
2106
+ for (int i = 0; i < nb; i++) {
2107
+
2108
+ float sumx2 = 0;
2109
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
2110
+ float sigma2 = 2*sumx2/QK_K;
2111
+
2112
+ for (int j = 0; j < QK_K/16; ++j) {
2113
+ if (quant_weights) {
2114
+ const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
2115
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
2116
+ } else {
2117
+ for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
2118
+ }
2119
+ float sumw = 0;
2120
+ for (int l = 0; l < 16; ++l) sumw += weight[l];
2121
+ sw[j] = sumw;
2122
+
2123
+ scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
2124
+
2125
+ }
2126
+
2127
+ memset(y[i].scales, 0, 12);
2128
+
2129
+ float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
2130
+ for (int j = 0; j < QK_K/16; ++j) {
2131
+ int l = Ls[j];
2132
+ if (j < 8) {
2133
+ y[i].scales[j] = l & 0xF;
2134
+ } else {
2135
+ y[i].scales[j-8] |= ((l & 0xF) << 4);
2136
+ }
2137
+ l >>= 4;
2138
+ y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
2139
+ }
2140
+ y[i].d = GGML_FP32_TO_FP16(d_block);
2141
+
2142
+ int8_t sc;
2143
+ for (int j = 0; j < QK_K/16; ++j) {
2144
+ sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
2145
+ sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
2146
+ float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2147
+ if (!d) {
2148
+ continue;
2149
+ }
2150
+ for (int ii = 0; ii < 16; ++ii) {
2151
+ int l = nearest_int(x[16*j + ii]/d);
2152
+ l = MAX(-4, MIN(3, l));
2153
+ L[16*j + ii] = l + 4;
2154
+ }
2155
+ }
2156
+
2157
+ memset(y[i].hmask, 0, QK_K/8);
2158
+ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
2159
+ int m = 0;
2160
+ uint8_t hm = 1;
2161
+ for (int j = 0; j < QK_K; ++j) {
2162
+ if (L[j] > 3) {
2163
+ y[i].hmask[m] |= hm;
2164
+ L[j] -= 4;
2165
+ }
2166
+ if (++m == QK_K/8) {
2167
+ m = 0; hm <<= 1;
2168
+ }
2169
+ }
2170
+ for (int j = 0; j < QK_K; j += 128) {
2171
+ for (int l = 0; l < 32; ++l) {
2172
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
2173
+ }
2174
+ }
2175
+
2176
+ x += QK_K;
2177
+ }
2178
+ #endif
2179
+ }
2180
+
2181
+ size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2182
+ (void)hist;
2183
+ int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2184
+ if (!quant_weights) {
2185
+ quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2186
+ }
2187
+ else {
2188
+ char * qrow = (char *)dst;
2189
+ for (int row = 0; row < nrow; ++row) {
2190
+ quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2191
+ src += n_per_row;
2192
+ qrow += row_size;
2193
+ }
2194
+ }
2195
+ return nrow * row_size;
2196
+ }
2197
+
2198
  // ====================== 4-bit (de)-quantization
2199
 
2200
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
 
2360
  return (n/QK_K*sizeof(block_q4_K));
2361
  }
2362
 
2363
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2364
+ #if QK_K != 256
2365
+ (void)quant_weights;
2366
+ quantize_row_q4_K_reference(x, y, n_per_row);
2367
+ #else
2368
+ assert(n_per_row % QK_K == 0);
2369
+ const int nb = n_per_row / QK_K;
2370
+
2371
+ uint8_t L[QK_K];
2372
+ uint8_t Laux[32];
2373
+ float weights[32];
2374
+ float mins[QK_K/32];
2375
+ float scales[QK_K/32];
2376
+
2377
+ for (int i = 0; i < nb; i++) {
2378
+
2379
+ float sum_x2 = 0;
2380
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2381
+ float sigma2 = sum_x2/QK_K;
2382
+ float av_x = sqrtf(sigma2);
2383
+
2384
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2385
+ float max_min = 0;
2386
+ for (int j = 0; j < QK_K/32; ++j) {
2387
+ if (quant_weights) {
2388
+ const float * qw = quant_weights + QK_K*i + 32*j;
2389
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2390
+ } else {
2391
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2392
+ }
2393
+ scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2394
+ //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2395
+ float scale = scales[j];
2396
+ if (scale > max_scale) {
2397
+ max_scale = scale;
2398
+ }
2399
+ float min = mins[j];
2400
+ if (min > max_min) {
2401
+ max_min = min;
2402
+ }
2403
+ }
2404
+
2405
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2406
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2407
+ for (int j = 0; j < QK_K/32; ++j) {
2408
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2409
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2410
+ ls = MIN(63, ls);
2411
+ lm = MIN(63, lm);
2412
+ if (j < 4) {
2413
+ y[i].scales[j] = ls;
2414
+ y[i].scales[j+4] = lm;
2415
+ } else {
2416
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2417
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2418
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2419
+ }
2420
+ }
2421
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2422
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2423
+
2424
+ uint8_t sc, m;
2425
+ for (int j = 0; j < QK_K/32; ++j) {
2426
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2427
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2428
+ if (!d) continue;
2429
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2430
+ for (int ii = 0; ii < 32; ++ii) {
2431
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2432
+ l = MAX(0, MIN(15, l));
2433
+ L[32*j + ii] = l;
2434
+ }
2435
+ }
2436
+ uint8_t * q = y[i].qs;
2437
+ for (int j = 0; j < QK_K; j += 64) {
2438
+ for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
2439
+ q += 32;
2440
+ }
2441
+
2442
+ x += QK_K;
2443
+
2444
+ }
2445
+ #endif
2446
+ }
2447
+
2448
+ size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2449
+ (void)hist;
2450
+ int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2451
+ if (!quant_weights) {
2452
+ quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2453
+ }
2454
+ else {
2455
+ char * qrow = (char *)dst;
2456
+ for (int row = 0; row < nrow; ++row) {
2457
+ quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2458
+ src += n_per_row;
2459
+ qrow += row_size;
2460
+ }
2461
+ }
2462
+ return nrow * row_size;
2463
+ }
2464
+
2465
  // ====================== 5-bit (de)-quantization
2466
 
2467
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
 
2557
  #else
2558
  float max_scale = 0, amax = 0;
2559
  for (int j = 0; j < QK_K/16; ++j) {
2560
+ scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
2561
  float abs_scale = fabsf(scales[j]);
2562
  if (abs_scale > amax) {
2563
  amax = abs_scale;
 
2668
  return (n/QK_K*sizeof(block_q5_K));
2669
  }
2670
 
2671
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2672
+ #if QK_K != 256
2673
+ (void)quant_weights;
2674
+ quantize_row_q5_K_reference(x, y, n_per_row);
2675
+ #else
2676
+ assert(n_per_row % QK_K == 0);
2677
+ const int nb = n_per_row / QK_K;
2678
+
2679
+ uint8_t L[QK_K];
2680
+ float mins[QK_K/32];
2681
+ float scales[QK_K/32];
2682
+ float weights[32];
2683
+ uint8_t Laux[32];
2684
+
2685
+ for (int i = 0; i < nb; i++) {
2686
+
2687
+ float sum_x2 = 0;
2688
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2689
+ float sigma2 = sum_x2/QK_K;
2690
+ float av_x = sqrtf(sigma2);
2691
+
2692
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2693
+ float max_min = 0;
2694
+ for (int j = 0; j < QK_K/32; ++j) {
2695
+ if (quant_weights) {
2696
+ const float * qw = quant_weights + QK_K*i + 32*j;
2697
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2698
+ } else {
2699
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2700
+ }
2701
+ scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2702
+ float scale = scales[j];
2703
+ if (scale > max_scale) {
2704
+ max_scale = scale;
2705
+ }
2706
+ float min = mins[j];
2707
+ if (min > max_min) {
2708
+ max_min = min;
2709
+ }
2710
+ }
2711
+
2712
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2713
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2714
+ for (int j = 0; j < QK_K/32; ++j) {
2715
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2716
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2717
+ ls = MIN(63, ls);
2718
+ lm = MIN(63, lm);
2719
+ if (j < 4) {
2720
+ y[i].scales[j] = ls;
2721
+ y[i].scales[j+4] = lm;
2722
+ } else {
2723
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2724
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2725
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2726
+ }
2727
+ }
2728
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2729
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2730
+
2731
+ uint8_t sc, m;
2732
+ for (int j = 0; j < QK_K/32; ++j) {
2733
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2734
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2735
+ if (!d) continue;
2736
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2737
+ for (int ii = 0; ii < 32; ++ii) {
2738
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2739
+ l = MAX(0, MIN(31, l));
2740
+ L[32*j + ii] = l;
2741
+ }
2742
+ }
2743
+
2744
+ uint8_t * restrict qh = y[i].qh;
2745
+ uint8_t * restrict ql = y[i].qs;
2746
+ memset(qh, 0, QK_K/8);
2747
+
2748
+ uint8_t m1 = 1, m2 = 2;
2749
+ for (int n = 0; n < QK_K; n += 64) {
2750
+ for (int j = 0; j < 32; ++j) {
2751
+ int l1 = L[n + j];
2752
+ if (l1 > 15) {
2753
+ l1 -= 16; qh[j] |= m1;
2754
+ }
2755
+ int l2 = L[n + j + 32];
2756
+ if (l2 > 15) {
2757
+ l2 -= 16; qh[j] |= m2;
2758
+ }
2759
+ ql[j] = l1 | (l2 << 4);
2760
+ }
2761
+ m1 <<= 2; m2 <<= 2;
2762
+ ql += 32;
2763
+ }
2764
+
2765
+ x += QK_K;
2766
+
2767
+ }
2768
+ #endif
2769
+ }
2770
+
2771
+ size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2772
+ (void)hist;
2773
+ int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2774
+ if (!quant_weights) {
2775
+ quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2776
+ }
2777
+ else {
2778
+ char * qrow = (char *)dst;
2779
+ for (int row = 0; row < nrow; ++row) {
2780
+ quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2781
+ src += n_per_row;
2782
+ qrow += row_size;
2783
+ }
2784
+ }
2785
+ return nrow * row_size;
2786
+ }
2787
+
2788
  // ====================== 6-bit (de)-quantization
2789
 
2790
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
 
2801
 
2802
  for (int ib = 0; ib < QK_K/16; ++ib) {
2803
 
2804
+ const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2805
  scales[ib] = scale;
2806
 
2807
  const float abs_scale = fabsf(scale);
 
2933
  return (n/QK_K*sizeof(block_q6_K));
2934
  }
2935
 
2936
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2937
+ #if QK_K != 256
2938
+ (void)quant_weights;
2939
+ quantize_row_q6_K_reference(x, y, n_per_row);
2940
+ #else
2941
+ assert(n_per_row % QK_K == 0);
2942
+ const int nb = n_per_row / QK_K;
2943
+
2944
+ int8_t L[QK_K];
2945
+ float scales[QK_K/16];
2946
+ //float weights[16];
2947
+
2948
+ for (int i = 0; i < nb; i++) {
2949
+
2950
+ //float sum_x2 = 0;
2951
+ //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
2952
+ //float sigma2 = sum_x2/QK_K;
2953
+
2954
+ float max_scale = 0;
2955
+ float max_abs_scale = 0;
2956
+
2957
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2958
+
2959
+ float scale;
2960
+ if (quant_weights) {
2961
+ const float * qw = quant_weights + QK_K*i + 16*ib;
2962
+ //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
2963
+ //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
2964
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
2965
+ } else {
2966
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2967
+ }
2968
+ scales[ib] = scale;
2969
+
2970
+ const float abs_scale = fabsf(scale);
2971
+ if (abs_scale > max_abs_scale) {
2972
+ max_abs_scale = abs_scale;
2973
+ max_scale = scale;
2974
+ }
2975
+
2976
+ }
2977
+
2978
+ if (!max_abs_scale) {
2979
+ memset(&y[i], 0, sizeof(block_q6_K));
2980
+ y[i].d = GGML_FP32_TO_FP16(0.f);
2981
+ x += QK_K;
2982
+ continue;
2983
+ }
2984
+
2985
+ float iscale = -128.f/max_scale;
2986
+ y[i].d = GGML_FP32_TO_FP16(1/iscale);
2987
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2988
+ y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
2989
+ }
2990
+
2991
+ for (int j = 0; j < QK_K/16; ++j) {
2992
+ float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
2993
+ if (!d) {
2994
+ continue;
2995
+ }
2996
+ for (int ii = 0; ii < 16; ++ii) {
2997
+ int l = nearest_int(x[16*j + ii]/d);
2998
+ l = MAX(-32, MIN(31, l));
2999
+ L[16*j + ii] = l + 32;
3000
+ }
3001
+ }
3002
+
3003
+ uint8_t * restrict ql = y[i].ql;
3004
+ uint8_t * restrict qh = y[i].qh;
3005
+ for (int j = 0; j < QK_K; j += 128) {
3006
+ for (int l = 0; l < 32; ++l) {
3007
+ const uint8_t q1 = L[j + l + 0] & 0xF;
3008
+ const uint8_t q2 = L[j + l + 32] & 0xF;
3009
+ const uint8_t q3 = L[j + l + 64] & 0xF;
3010
+ const uint8_t q4 = L[j + l + 96] & 0xF;
3011
+ ql[l+ 0] = q1 | (q3 << 4);
3012
+ ql[l+32] = q2 | (q4 << 4);
3013
+ qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
3014
+ }
3015
+ ql += 64;
3016
+ qh += 32;
3017
+ }
3018
+
3019
+ x += QK_K;
3020
+
3021
+ }
3022
+ #endif
3023
+ }
3024
+
3025
+ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3026
+ (void)hist;
3027
+ int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3028
+ if (!quant_weights) {
3029
+ quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3030
+ }
3031
+ else {
3032
+ char * qrow = (char *)dst;
3033
+ for (int row = 0; row < nrow; ++row) {
3034
+ quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3035
+ src += n_per_row;
3036
+ qrow += row_size;
3037
+ }
3038
+ }
3039
+ return nrow * row_size;
3040
+ }
3041
+
3042
  // ====================== "True" 2-bit (de)-quantization
3043
 
3044
  static const uint64_t iq2xxs_grid[256] = {
ggml-quants.h CHANGED
@@ -249,4 +249,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict
249
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
250
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
251
  size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
252
-
 
 
 
 
249
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
250
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
251
  size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
252
+ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
253
+ size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
254
+ size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
255
+ size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
ggml.c CHANGED
@@ -18713,26 +18713,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18713
  case GGML_TYPE_Q3_K:
18714
  {
18715
  GGML_ASSERT(start % QK_K == 0);
18716
- block_q3_K * block = (block_q3_K*)dst + start / QK_K;
18717
- result = ggml_quantize_q3_K(src + start, block, n, n, hist);
 
 
 
18718
  } break;
18719
  case GGML_TYPE_Q4_K:
18720
  {
18721
  GGML_ASSERT(start % QK_K == 0);
18722
- block_q4_K * block = (block_q4_K*)dst + start / QK_K;
18723
- result = ggml_quantize_q4_K(src + start, block, n, n, hist);
 
 
 
18724
  } break;
18725
  case GGML_TYPE_Q5_K:
18726
  {
18727
  GGML_ASSERT(start % QK_K == 0);
18728
- block_q5_K * block = (block_q5_K*)dst + start / QK_K;
18729
- result = ggml_quantize_q5_K(src + start, block, n, n, hist);
 
 
 
18730
  } break;
18731
  case GGML_TYPE_Q6_K:
18732
  {
18733
  GGML_ASSERT(start % QK_K == 0);
18734
- block_q6_K * block = (block_q6_K*)dst + start / QK_K;
18735
- result = ggml_quantize_q6_K(src + start, block, n, n, hist);
 
 
 
18736
  } break;
18737
  case GGML_TYPE_IQ2_XXS:
18738
  {
 
18713
  case GGML_TYPE_Q3_K:
18714
  {
18715
  GGML_ASSERT(start % QK_K == 0);
18716
+ GGML_ASSERT(start % n_per_row == 0);
18717
+ size_t start_row = start / n_per_row;
18718
+ size_t row_size = ggml_row_size(type, n_per_row);
18719
+ result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18720
+ GGML_ASSERT(result == row_size * nrows);
18721
  } break;
18722
  case GGML_TYPE_Q4_K:
18723
  {
18724
  GGML_ASSERT(start % QK_K == 0);
18725
+ GGML_ASSERT(start % n_per_row == 0);
18726
+ size_t start_row = start / n_per_row;
18727
+ size_t row_size = ggml_row_size(type, n_per_row);
18728
+ result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18729
+ GGML_ASSERT(result == row_size * nrows);
18730
  } break;
18731
  case GGML_TYPE_Q5_K:
18732
  {
18733
  GGML_ASSERT(start % QK_K == 0);
18734
+ GGML_ASSERT(start % n_per_row == 0);
18735
+ size_t start_row = start / n_per_row;
18736
+ size_t row_size = ggml_row_size(type, n_per_row);
18737
+ result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18738
+ GGML_ASSERT(result == row_size * nrows);
18739
  } break;
18740
  case GGML_TYPE_Q6_K:
18741
  {
18742
  GGML_ASSERT(start % QK_K == 0);
18743
+ GGML_ASSERT(start % n_per_row == 0);
18744
+ size_t start_row = start / n_per_row;
18745
+ size_t row_size = ggml_row_size(type, n_per_row);
18746
+ result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18747
+ GGML_ASSERT(result == row_size * nrows);
18748
  } break;
18749
  case GGML_TYPE_IQ2_XXS:
18750
  {