ggerganov committed · Commit 227f2ae · unverified · 1 Parent(s): 6490f98

ggml : add IQ2 to test-backend-ops + refactoring (llama/4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
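
Note on the last two commit-message items: they refer to how test-backend-ops fills tensors with random data. Below is a generic sketch of the pattern in C (the actual tests are C++ and use different names; everything here is hypothetical): seed one generator once and reuse it for every tensor, instead of constructing a fresh generator per tensor.

#include <stddef.h>
#include <stdint.h>

/* hypothetical shared RNG state, seeded once for the whole test run */
static uint64_t g_rng_state = 0x853c49e6748fea9bULL;

static float frand(void) {
    /* xorshift64: cheap and good enough for generating test data */
    g_rng_state ^= g_rng_state << 13;
    g_rng_state ^= g_rng_state >> 7;
    g_rng_state ^= g_rng_state << 17;
    return (float)(g_rng_state >> 40) / (float)(1 << 24); /* uniform in [0, 1) */
}

/* fill one tensor's data without constructing a new RNG per tensor */
static void init_tensor_data(float * data, size_t n, float vmin, float vmax) {
    for (size_t i = 0; i < n; ++i) {
        data[i] = vmin + (vmax - vmin) * frand();
    }
}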

Files changed (6)
  1. ggml-backend.c +2 -0
  2. ggml-cuda.cu +9 -3
  3. ggml-quants.c +30 -44
  4. ggml-quants.h +3 -0
  5. ggml.c +31 -3
  6. ggml.h +16 -4
ggml-backend.c CHANGED
@@ -692,6 +692,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str

GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
        default:
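
The new CPY case hard-codes the two IQ2 types; per the inline comment, these are exactly the types whose type traits have no from_float routine yet. A hedged, illustrative equivalent of the same check expressed through the type traits (not what the patch does; hypothetical helper name):

#include "ggml.h"

// illustrative only: CPY into a quantized type needs a from_float conversion,
// and it is not implemented for GGML_TYPE_IQ2_XXS / GGML_TYPE_IQ2_XS at this point
static bool cpu_supports_cpy_to(enum ggml_type type) {
    return ggml_internal_get_type_traits(type).from_float != NULL;
}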
ggml-cuda.cu CHANGED
@@ -5131,10 +5131,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
+    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index

-        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx

        const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int

@@ -10918,6 +10918,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
+                }
                return true;
            } break;
        case GGML_OP_GET_ROWS:
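
The mul_mat_vec_q change is the out-of-bounds fix from the commit message: previously the per-thread block offset threadIdx.x / (qi/vdr) was added to i after the i < blocks_per_row check, so when blocks_per_row is not a multiple of the per-iteration stride the last iteration could push trailing threads past the end of the row. Folding the offset into the loop variable makes the bounds check cover it. A small host-side C sketch of the index arithmetic (hypothetical sizes, not the actual kernel):

#include <stdio.h>

int main(void) {
    const int blocks_per_row  = 10; // hypothetical: quant blocks per matrix row
    const int blocks_per_warp = 4;  // hypothetical: blocks covered by one warp per iteration

    for (int off = 0; off < blocks_per_warp; ++off) {   // off ~ threadIdx.x / (qi/vdr)
        // old indexing: the offset is added after the bounds check
        for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
            if (i + off >= blocks_per_row) {
                printf("old: offset %d touches block %d (out of bounds)\n", off, i + off);
            }
        }
        // new indexing: the offset is part of the loop variable, so the
        // i < blocks_per_row check also guards the per-thread offset
        for (int i = off; i < blocks_per_row; i += blocks_per_warp) {
            // i is always a valid block index here
        }
    }
    return 0;
}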
ggml-quants.c CHANGED
@@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
    }
    float sumlx = 0;
    float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
    for (int i = 0; i < n; ++i) {
+#endif
        int l = nearest_int(iscale * x[i]);
        l = MAX(-nmax, MIN(nmax-1, l));
        L[i] = l + nmax;
@@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
    float max = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
    for (int i = 1; i < n; ++i) {
+#endif
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
        float w = weights ? weights[i] : x[i]*x[i];
@@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
        min = 0;
    }
    if (max <= min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
+        memset(L, 0, n);
        *the_min = -min;
        return 0.f;
    }
@@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri

size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
    if (!quant_weights) {
        quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
    }
@@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri

size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
    if (!quant_weights) {
        quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
    }
@@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri

size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
    if (!quant_weights) {
        quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
    }
@@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri

size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
    if (!quant_weights) {
        quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
    }
@@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri

size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
    if (!quant_weights) {
        quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
    }
@@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
    if (!quant_weights) {
        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
@@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
    if (!quant_weights) {
        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
@@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
    if (!quant_weights) {
        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
@@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
    if (!quant_weights) {
        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
@@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
}

-static void q2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(int grid_size) {
    const int gindex = iq2_data_index(grid_size);
    if (iq2_data[gindex].grid) {
        return;
@@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
    free(dist2);
}

-void ggml_init_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_init_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_init_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
-static void q2xs_deinit_impl(int grid_size) {
+void iq2xs_free_impl(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
    const int gindex = iq2_data_index(grid_size);
    if (iq2_data[gindex].grid) {
@@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
    }
}

-void ggml_deinit_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_deinit_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_deinit_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
@@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
    const int * kmap_q2xs = iq2_data[gindex].map;
    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;

-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(n%QK_K == 0);

    const int kMaxQ = 3;
@@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
    const int * kmap_q2xs = iq2_data[gindex].map;
    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;

-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights && "missing quantization weights");
+    GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(n%QK_K == 0);

    const int kMaxQ = 3;
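
One of the recurring changes above is mechanical: row_size switches from int to size_t because it comes from ggml_row_size() (which returns size_t) and feeds byte-offset arithmetic such as nrow * row_size, where a 32-bit int can wrap once the total exceeds INT_MAX. A minimal sketch of the failure mode, with hypothetical sizes:

#include <stdio.h>

int main(void) {
    // hypothetical sizes, large enough that the total byte count exceeds INT_MAX
    size_t row_size = 147456;          // bytes per quantized row
    size_t nrow     = 32000;           // number of rows

    size_t total   = nrow * row_size;  // fits in size_t
    int    total_i = (int)total;       // conversion typically truncates/wraps

    printf("size_t total: %zu bytes\n", total);
    printf("int    total: %d bytes\n", total_i);
    return 0;
}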
ggml-quants.h CHANGED
@@ -257,3 +257,6 @@ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row,
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+void iq2xs_init_impl(int grid_size);
+void iq2xs_free_impl(int grid_size);
ggml.c CHANGED
@@ -18524,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(

////////////////////////////////////////////////////////////////////////////////

+void ggml_quantize_init(enum ggml_type type) {
+    ggml_critical_section_start();
+
+    switch (type) {
+        case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+        case GGML_TYPE_IQ2_XS:  iq2xs_init_impl(512); break;
+        default: // nothing
+            break;
+    }
+
+    ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+    ggml_critical_section_start();
+
+    iq2xs_free_impl(256);
+    iq2xs_free_impl(512);
+
+    ggml_critical_section_end();
+}
+
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
    assert(k % QK4_0 == 0);
    const int nb = k / QK4_0;
@@ -18651,9 +18673,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
    return (n/QK8_0*sizeof(block_q8_0));
}

+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+    return
+        type == GGML_TYPE_IQ2_XXS ||
+        type == GGML_TYPE_IQ2_XS;
+}
+
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
-    (void)imatrix;
+    ggml_quantize_init(type); // this is noop if already initialized
    size_t result = 0;
    int n = nrows * n_per_row;
    switch (type) {
@@ -18766,13 +18794,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
            } break;
        case GGML_TYPE_F16:
            {
-                int elemsize = sizeof(ggml_fp16_t);
+                size_t elemsize = sizeof(ggml_fp16_t);
                ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
                result = n * elemsize;
            } break;
        case GGML_TYPE_F32:
            {
-                int elemsize = sizeof(float);
+                size_t elemsize = sizeof(float);
                result = n * elemsize;
                memcpy((uint8_t *)dst + start * elemsize, src + start, result);
            } break;
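
Together with the ggml.h changes below, this replaces the IQ2-specific ggml_init_iq2_quantization/ggml_deinit_iq2_quantization pair with a general ggml_quantize_init/ggml_quantize_free API, guarded by the ggml critical section and invoked lazily by ggml_quantize_chunk. A minimal usage sketch of the new API (the wrapper function and buffer names are hypothetical):

#include "ggml.h"

// quantize nrows * n_per_row floats from `data` into `out`; returns bytes written,
// or 0 if the type needs an importance matrix and none was provided
size_t quantize_rows(enum ggml_type type, const float * data, void * out,
                     int nrows, int n_per_row, const float * imatrix) {
    if (ggml_quantize_requires_imatrix(type) && imatrix == NULL) {
        return 0; // IQ2_XXS / IQ2_XS cannot be quantized without an imatrix
    }
    int64_t hist[16] = {0};
    // ggml_quantize_chunk calls ggml_quantize_init(type) internally, so the
    // IQ2 grid tables are built on first use
    return ggml_quantize_chunk(type, data, out, /*start=*/0,
                               nrows, n_per_row, hist, imatrix);
}

// at program exit: ggml_quantize_free(); // releases memory allocated by ggml_quantize_init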
ggml.h CHANGED
@@ -2065,6 +2065,18 @@ extern "C" {
    // quantization
    //

+    // - ggml_quantize_init can be called multiple times with the same type
+    //   it will only initialize the quantization tables for the first call or after ggml_quantize_free
+    //   automatically called by ggml_quantize_chunk for convenience
+    //
+    // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+    //   call this at the end of the program to avoid memory leaks
+    //
+    // note: these are thread-safe
+    //
+    GGML_API void ggml_quantize_init(enum ggml_type type);
+    GGML_API void ggml_quantize_free(void);
+
    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2078,13 +2090,13 @@ extern "C" {
    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

+    // some quantization type cannot be used without an importance matrix
+    GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+    // calls ggml_quantize_init internally (i.e. can allocate memory)
    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
        int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);

-    // These are needed for IQ2_XS and IQ2_XXS quantizations
-    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
-    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-
    //
    // gguf
    //