ggerganov committed on
Commit 7b8292f · unverified · 1 Parent(s): d4425e8

ggml : sync latest ggml

Files changed (2)
  1. ggml.c +238 -11
  2. ggml.h +18 -0
ggml.c CHANGED
@@ -2712,9 +2712,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 
     "FLASH_ATTN",
     "FLASH_FF",
+
+    "MAP_UNARY",
+    "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2757,9 +2760,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "flash_attn(x)",
     "flash_ff(x)",
+
+    "f(x)",
+    "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3054,9 +3060,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+
     *ctx = (struct ggml_context) {
-        /*.mem_size         =*/ params.mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
+        /*.mem_size         =*/ mem_size,
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
         /*.n_objects        =*/ 0,
@@ -3066,7 +3074,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.scratch_save     =*/ { 0, 0, NULL, },
     };
 
-    GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+    GGML_ASSERT(ctx->mem_buffer != NULL);
 
     ggml_assert_aligned(ctx->mem_buffer);
 
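The new rounding expression relies on GGML_MEM_ALIGN being a power of two: adding align - 1 and then masking off the low bits snaps the requested size up to the next aligned multiple, so a caller-supplied mem_size no longer has to be a multiple of the alignment. A minimal standalone sketch of the same bit trick (the round_up helper is illustrative, not part of ggml):

#include <assert.h>
#include <stddef.h>

// Round n up to the next multiple of align (align must be a power of two).
// Same computation the new ggml_init applies to params.mem_size.
static size_t round_up(size_t n, size_t align) {
    return (n + align - 1) & ~(align - 1);
}

int main(void) {
    // with a 16-byte alignment, sizes snap upward to the next multiple
    assert(round_up( 0, 16) ==  0);
    assert(round_up( 1, 16) == 16);
    assert(round_up(16, 16) == 16);
    assert(round_up(17, 16) == 32);
    return 0;
}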
@@ -4905,6 +4913,90 @@ struct ggml_tensor * ggml_flash_ff(
     return result;
 }
 
+// ggml_map_unary
+
+struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context       * ctx,
+        struct ggml_tensor        * a,
+        const ggml_unary_op_f32_t   fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_MAP_UNARY;
+    result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0   = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context       * ctx,
+        struct ggml_tensor        * a,
+        const ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32(
+        struct ggml_context       * ctx,
+        struct ggml_tensor        * a,
+        const ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_binary
+
+struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        struct ggml_tensor         * b,
+        const ggml_binary_op_f32_t   fun,
+        bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_MAP_BINARY;
+    result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0   = a;
+    result->src1   = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        struct ggml_tensor         * b,
+        const ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        struct ggml_tensor         * b,
+        const ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_set_param(
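A note on the addr_tensor trick above: graph nodes can only reference other tensors, so the callback's address is written into the data buffer of a small GGML_TYPE_I32 tensor sized to hold one pointer, attached to the node via opt[0], and read back with the inverse cast at dispatch time. A minimal sketch of the same store/load round trip outside of ggml (neg_row and the storage buffer are hypothetical; alignment is assumed adequate, as it is for tensor data):

#include <assert.h>
#include <stdint.h>

typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);

// hypothetical row kernel
static void neg_row(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; i++) {
        dst[i] = -src[i];
    }
}

int main(void) {
    // pointer-sized int32_t buffer, mirroring
    // ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t))
    int32_t storage[sizeof(void (*)(void)) / sizeof(int32_t)];

    *((void (**)(void))storage) = (void (*)(void))neg_row;       // store, as in ggml_map_unary_impl_f32
    ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)storage);  // load, as in ggml_compute_forward

    float src[2] = {1.0f, -2.0f}, dst[2];
    fun(2, dst, src);
    assert(dst[0] == -1.0f && dst[1] == 2.0f);
    return 0;
}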
@@ -7507,6 +7599,8 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
@@ -7514,11 +7608,13 @@ static void ggml_compute_forward_rope_f32(
             if (ir++ < ir0) continue;
             if (ir >  ir1) break;
 
+            float theta = (float)p;
+
             for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                const float theta = powf(10000.0, ((float)-i0)/n_dims);
+                const float cos_theta = cosf(theta);
+                const float sin_theta = sinf(theta);
 
-                const float cos_theta = cosf(p*theta);
-                const float sin_theta = sinf(p*theta);
+                theta *= theta_scale;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                       float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -7580,6 +7676,8 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
@@ -7587,11 +7685,13 @@ static void ggml_compute_forward_rope_f16(
             if (ir++ < ir0) continue;
             if (ir >  ir1) break;
 
+            float theta = (float)p;
+
             for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                const float theta = powf(10000.0, ((float)-i0)/n_dims);
+                const float cos_theta = cosf(theta);
+                const float sin_theta = sinf(theta);
 
-                const float cos_theta = cosf(p*theta);
-                const float sin_theta = sinf(p*theta);
+                theta *= theta_scale;
 
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                       ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
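The RoPE change in both hunks is algebraic, not behavioral: the old code called powf once per element pair to get 10000^(-i0/n_dims) and then took cos(p*theta)/sin(p*theta), while the new code starts from theta = p and multiplies by theta_scale = 10000^(-2/n_dims) on each pass, using the identity p * 10000^(-i0/n_dims) = p * theta_scale^(i0/2). One powf per row replaces one per pair. A small self-check under those definitions (the tolerance covers float round-off in the running product):

#include <assert.h>
#include <math.h>

int main(void) {
    const int n_dims = 64;
    const int p      = 5; // example position

    const float theta_scale = powf(10000.0, -2.0f/n_dims);

    float theta = (float)p;
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        // old formulation: a powf per pair
        const float theta_old = p*powf(10000.0, ((float)-i0)/n_dims);
        // new formulation: running product, equal up to rounding
        assert(fabsf(theta - theta_old) <= 1e-3f*fabsf(theta_old));
        theta *= theta_scale;
    }
    return 0;
}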
@@ -8865,6 +8965,111 @@ static void ggml_compute_forward_flash_ff(
     }
 }
 
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *)  dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *)  dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_binary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 /////////////////////////////////
 
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
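As the loops above show, the forward kernels hand the callback one row at a time: nc = src0->ne[0] contiguous floats per operand, ggml_nrows(src0) calls in total. A conforming callback therefore only needs to process n contiguous elements; for example, a ggml_binary_op_f32_t could look like this (mul_rows is a hypothetical user kernel, not part of the commit):

// Row kernel matching ggml_binary_op_f32_t: dst[i] = a[i]*b[i] over one row.
static void mul_rows(const int n, float * dst, const float * a, const float * b) {
    for (int i = 0; i < n; i++) {
        dst[i] = a[i]*b[i];
    }
}

Registered via ggml_map_binary_f32, such a kernel runs single-threaded: the scheduling hunk further below pins both map ops to node->n_tasks = 1.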
@@ -9014,6 +9219,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
             } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9273,6 +9490,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+            {
+                GGML_ASSERT(false); // not supported
+            } break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9765,6 +9987,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         work_size = MAX(work_size, cur);
                     } break;
+                case GGML_OP_MAP_UNARY:
+                case GGML_OP_MAP_BINARY:
+                    {
+                        node->n_tasks = 1;
+                    } break;
                 case GGML_OP_NONE:
                     {
                         node->n_tasks = 1;
ggml.h CHANGED
@@ -253,6 +253,9 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
@@ -652,6 +655,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor  * c0,
         struct ggml_tensor  * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context       * ctx,
+        struct ggml_tensor        * a,
+        const ggml_unary_op_f32_t   fun);
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        struct ggml_tensor         * b,
+        const ggml_binary_op_f32_t   fun);
+
 //
 // automatic differentiation
 //
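Putting the new public API together, a minimal usage sketch against this header: define a row-wise callback, wrap a tensor with ggml_map_unary_f32, and evaluate the graph. This is an unverified sketch assuming the graph API of this era (ggml_build_forward returning a ggml_cgraph by value, ggml_graph_compute taking the context); sqrt_row and the buffer size are illustrative choices:

#include <math.h>
#include <stdio.h>
#include "ggml.h"

// row-wise callback matching ggml_unary_op_f32_t: dst[i] = sqrtf(src[i])
static void sqrt_row(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; i++) {
        dst[i] = sqrtf(src[i]);
    }
}

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * xd = (float *) x->data;
    xd[0] = 1.0f; xd[1] = 4.0f; xd[2] = 9.0f; xd[3] = 16.0f;

    // apply the custom op through the new mapping API
    struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, sqrt_row);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute(ctx, &gf);

    const float * yd = (const float *) y->data;
    printf("%.0f %.0f %.0f %.0f\n", yd[0], yd[1], yd[2], yd[3]); // 1 2 3 4

    ggml_free(ctx);
    return 0;
}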