leejet, ggerganov and slaren committed on
Commit dd8e3f9 (unverified)
1 Parent(s): 98b861a

add some new ops, fix some operators, and add batch operations to certain operators (ggml/747)


* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embedding support for the Metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <[email protected]>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: slaren <[email protected]>
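
The two ops introduced here are exposed as ggml_arange and ggml_timestep_embedding (declared in ggml.h below). The following is a minimal usage sketch, not part of the commit: it assumes a CPU-only build of ggml at this revision, and the memory size, thread count and argument values are illustrative only.

// Minimal usage sketch (not part of this commit): builds a small graph with the
// two new ops and runs it on the CPU. Assumes a CPU-only build of ggml at this
// revision; memory size, thread count and argument values are illustrative.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // arange(0, 10, 2) -> 1-D f32 tensor with ceil((stop - start)/step) = 5 elements
    struct ggml_tensor * rng = ggml_arange(ctx, 0.0f, 10.0f, 2.0f);

    // timestep_embedding(timesteps, dim, max_period) -> [N, dim] sinusoidal embedding
    struct ggml_tensor * ts  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4); // N = 4 timesteps
    ggml_set_f32(ts, 1.0f);                                               // dummy timestep values
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, ts, 320, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, rng);
    ggml_build_forward_expand(gf, emb);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    printf("arange: %d elements, embedding: %d x %d\n",
           (int) ggml_nelements(rng), (int) emb->ne[0], (int) emb->ne[1]);

    ggml_free(ctx);
    return 0;
}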

Files changed (5)
  1. ggml-cuda.cu +187 -42
  2. ggml-metal.m +59 -3
  3. ggml-metal.metal +43 -0
  4. ggml.c +199 -8
  5. ggml.h +17 -0
ggml-cuda.cu CHANGED
@@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
616
  #define CUDA_UPSCALE_BLOCK_SIZE 256
617
  #define CUDA_CONCAT_BLOCK_SIZE 256
618
  #define CUDA_PAD_BLOCK_SIZE 256
 
 
619
  #define CUDA_ACC_BLOCK_SIZE 256
620
  #define CUDA_IM2COL_BLOCK_SIZE 256
621
  #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -990,17 +992,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
990
  nidx +
991
  blockIdx.y * ne0 +
992
  blockIdx.z * ne0 * gridDim.y;
993
- dst[offset_dst] = x[offset_src];
994
  } else {
995
  int offset_src =
996
  nidx +
997
  blockIdx.y * ne0 +
998
  (blockIdx.z - ne02) * ne0 * gridDim.y;
999
- dst[offset_dst] = y[offset_src];
1000
  }
1001
  }
1002
 
1003
- static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
1004
  int ne0 = ne00 * scale_factor;
1005
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
1006
  if (nidx >= ne0) {
@@ -1012,7 +1018,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
1012
  int offset_src =
1013
  i00 +
1014
  i01 * ne00 +
1015
- blockIdx.z * nb02;
1016
  int offset_dst =
1017
  nidx +
1018
  blockIdx.y * ne0 +
@@ -1020,7 +1026,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
1020
  dst[offset_dst] = x[offset_src];
1021
  }
1022
 
1023
- static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
1024
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
1025
  if (nidx >= ne0) {
1026
  return;
@@ -1031,19 +1040,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
1031
  nidx +
1032
  blockIdx.y * ne0 +
1033
  blockIdx.z * ne0 * gridDim.y;
1034
- if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
1035
  int offset_src =
1036
  nidx +
1037
  blockIdx.y * ne00 +
1038
  blockIdx.z * ne00 * ne01;
1039
- dst[offset_dst] = x[offset_src];
1040
  } else {
1041
  dst[offset_dst] = 0.0f;
1042
  }
1043
  }
1044
 
1045
  template <int block_size>
1046
  static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
 
 
1047
  int start = blockIdx.x * group_size;
1048
  int end = start + group_size;
1049
 
@@ -6449,7 +6492,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
6449
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
6450
  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
6451
  const int nb12, const int nb13) {
6452
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
6453
 
6454
  if (i >= ne) {
6455
  return;
@@ -6457,17 +6500,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
6457
 
6458
  // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
6459
  // then combine those indices with the corresponding byte offsets to get the total offsets
6460
- const int i03 = i/(ne00 * ne01 * ne02);
6461
- const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
6462
- const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
6463
- const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
6464
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
6465
-
6466
- const int i13 = i/(ne10 * ne11 * ne12);
6467
- const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
6468
- const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
6469
- const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
6470
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
6471
 
6472
  cpy_1(cx + x_offset, cdst + dst_offset);
6473
  }
@@ -6956,23 +6999,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
6956
 
6957
  template <typename T>
6958
  static __global__ void im2col_kernel(
6959
- const float * x, T * dst, int batch_offset,
6960
- int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
6961
  int s0, int s1, int p0, int p1, int d0, int d1) {
6962
- const int i = threadIdx.x + blockIdx.x * blockDim.x;
6963
  if (i >= pelements) {
6964
  return;
6965
  }
6966
 
6967
- const int ksize = OW * (KH > 1 ? KW : 1);
6968
- const int kx = i / ksize;
6969
- const int kd = kx * ksize;
6970
- const int ky = (i - kd) / OW;
6971
- const int ix = i % OW;
6972
 
6973
- const int oh = blockIdx.y;
6974
- const int batch = blockIdx.z / IC;
6975
- const int ic = blockIdx.z % IC;
6976
 
6977
  const int64_t iiw = ix * s0 + kx * d0 - p0;
6978
  const int64_t iih = oh * s1 + ky * d1 - p1;
@@ -7298,19 +7341,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
7298
  concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
7299
  }
7300
 
7301
- static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
 
7302
  int ne0 = (ne00 * scale_factor);
7303
  int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
7304
- dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
7305
  upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
7306
  }
7307
 
7308
  static void pad_f32_cuda(const float * x, float * dst,
7309
- const int ne00, const int ne01, const int ne02,
7310
- const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
7311
  int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
7312
- dim3 gridDim(num_blocks, ne1, ne2);
7313
- pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
7314
  }
7315
 
7316
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@@ -8443,8 +8500,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
8443
 
8444
  template <typename T>
8445
  static void im2col_cuda(const float* x, T* dst,
8446
- int IW, int IH, int OW, int OH, int KW, int KH, int IC,
8447
- int batch, int batch_offset, int offset_delta,
8448
  int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
8449
  const int parallel_elements = OW * KW * KH;
8450
  const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@@ -9123,7 +9180,7 @@ static void ggml_cuda_op_group_norm(
9123
 
9124
  int num_groups = dst->op_params[0];
9125
  int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
9126
- group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
9127
 
9128
  (void) src1;
9129
  (void) dst;
@@ -9156,7 +9213,7 @@ static void ggml_cuda_op_upscale(
9156
 
9157
  const int scale_factor = dst->op_params[0];
9158
 
9159
- upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
9160
 
9161
  (void) src1;
9162
  (void) dst;
@@ -9172,8 +9229,49 @@ static void ggml_cuda_op_pad(
9172
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
9173
 
9174
  pad_f32_cuda(src0_dd, dst_dd,
9175
- src0->ne[0], src0->ne[1], src0->ne[2],
9176
- dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
9177
 
9178
  (void) src1;
9179
  (void) dst;
@@ -10458,6 +10556,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
10458
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
10459
  }
10460
 
10461
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10462
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
10463
  }
@@ -11358,6 +11495,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
11358
  case GGML_OP_PAD:
11359
  func = ggml_cuda_pad;
11360
  break;
11361
  case GGML_OP_LEAKY_RELU:
11362
  func = ggml_cuda_leaky_relu;
11363
  break;
@@ -12253,6 +12396,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
12253
  case GGML_OP_GROUP_NORM:
12254
  case GGML_OP_UPSCALE:
12255
  case GGML_OP_PAD:
 
 
12256
  case GGML_OP_LEAKY_RELU:
12257
  return true;
12258
  default:
 
616
  #define CUDA_UPSCALE_BLOCK_SIZE 256
617
  #define CUDA_CONCAT_BLOCK_SIZE 256
618
  #define CUDA_PAD_BLOCK_SIZE 256
619
+ #define CUDA_ARANGE_BLOCK_SIZE 256
620
+ #define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
621
  #define CUDA_ACC_BLOCK_SIZE 256
622
  #define CUDA_IM2COL_BLOCK_SIZE 256
623
  #define CUDA_POOL2D_BLOCK_SIZE 256
 
992
  nidx +
993
  blockIdx.y * ne0 +
994
  blockIdx.z * ne0 * gridDim.y;
995
+ dst[offset_dst] = x[offset_src];
996
  } else {
997
  int offset_src =
998
  nidx +
999
  blockIdx.y * ne0 +
1000
  (blockIdx.z - ne02) * ne0 * gridDim.y;
1001
+ dst[offset_dst] = y[offset_src];
1002
  }
1003
  }
1004
 
1005
+ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
1006
+ // blockIdx.z: idx of ne02*ne03
1007
+ // blockIdx.y: idx of ne01*scale_factor, aka ne1
1008
+ // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
1009
+ // ne00xne01: ne00 * ne01
1010
  int ne0 = ne00 * scale_factor;
1011
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
1012
  if (nidx >= ne0) {
 
1018
  int offset_src =
1019
  i00 +
1020
  i01 * ne00 +
1021
+ blockIdx.z * ne00xne01;
1022
  int offset_dst =
1023
  nidx +
1024
  blockIdx.y * ne0 +
 
1026
  dst[offset_dst] = x[offset_src];
1027
  }
1028
 
1029
+ static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
1030
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
1031
+ // blockIdx.y: idx of ne1
1032
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
1033
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
1034
  if (nidx >= ne0) {
1035
  return;
 
1040
  nidx +
1041
  blockIdx.y * ne0 +
1042
  blockIdx.z * ne0 * gridDim.y;
1043
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
1044
  int offset_src =
1045
  nidx +
1046
  blockIdx.y * ne00 +
1047
  blockIdx.z * ne00 * ne01;
1048
+ dst[offset_dst] = x[offset_src];
1049
  } else {
1050
  dst[offset_dst] = 0.0f;
1051
  }
1052
  }
1053
 
1054
+ static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
1055
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
1056
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
1057
+ if (nidx >= ne0) {
1058
+ return;
1059
+ }
1060
+ dst[nidx] = start + step * nidx;
1061
+ }
1062
+
1063
+ static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
1064
+ // blockIDx.y: idx of timesteps->ne[0]
1065
+ // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
1066
+ int i = blockIdx.y;
1067
+ int j = threadIdx.x + blockIdx.x * blockDim.x;
1068
+ float * embed_data = (float *)((char *)dst + i*nb1);
1069
+
1070
+ if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
1071
+ embed_data[dim] = 0.f;
1072
+ }
1073
+
1074
+ int half = dim / 2;
1075
+ if (j >= half) {
1076
+ return;
1077
+ }
1078
+
1079
+ float timestep = timesteps[i];
1080
+ float freq = (float)expf(-logf(max_period) * j / half);
1081
+ float arg = timestep * freq;
1082
+ embed_data[j] = cosf(arg);
1083
+ embed_data[j + half] = sinf(arg);
1084
+ }
1085
+
1086
  template <int block_size>
1087
  static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
1088
+ // blockIdx.x: num_groups idx
1089
+ // threadIdx.x: block_size idx
1090
  int start = blockIdx.x * group_size;
1091
  int end = start + group_size;
1092
 
 
6492
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
6493
  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
6494
  const int nb12, const int nb13) {
6495
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
6496
 
6497
  if (i >= ne) {
6498
  return;
 
6500
 
6501
  // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
6502
  // then combine those indices with the corresponding byte offsets to get the total offsets
6503
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
6504
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
6505
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
6506
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
6507
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
6508
+
6509
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
6510
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
6511
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
6512
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
6513
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
6514
 
6515
  cpy_1(cx + x_offset, cdst + dst_offset);
6516
  }
 
6999
 
7000
  template <typename T>
7001
  static __global__ void im2col_kernel(
7002
+ const float * x, T * dst, int64_t batch_offset,
7003
+ int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
7004
  int s0, int s1, int p0, int p1, int d0, int d1) {
7005
+ const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
7006
  if (i >= pelements) {
7007
  return;
7008
  }
7009
 
7010
+ const int64_t ksize = OW * (KH > 1 ? KW : 1);
7011
+ const int64_t kx = i / ksize;
7012
+ const int64_t kd = kx * ksize;
7013
+ const int64_t ky = (i - kd) / OW;
7014
+ const int64_t ix = i % OW;
7015
 
7016
+ const int64_t oh = blockIdx.y;
7017
+ const int64_t batch = blockIdx.z / IC;
7018
+ const int64_t ic = blockIdx.z % IC;
7019
 
7020
  const int64_t iiw = ix * s0 + kx * d0 - p0;
7021
  const int64_t iih = oh * s1 + ky * d1 - p1;
 
7341
  concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
7342
  }
7343
 
7344
+ static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
7345
+ const int scale_factor, cudaStream_t stream) {
7346
  int ne0 = (ne00 * scale_factor);
7347
  int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
7348
+ dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
7349
  upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
7350
  }
7351
 
7352
  static void pad_f32_cuda(const float * x, float * dst,
7353
+ const int ne00, const int ne01, const int ne02, const int ne03,
7354
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
7355
  int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
7356
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
7357
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
7358
+ }
7359
+
7360
+ static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
7361
+ int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
7362
+ arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
7363
+ }
7364
+
7365
+ static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
7366
+ const int dim, const int max_period, cudaStream_t stream) {
7367
+ int half_ceil = (dim + 1) / 2;
7368
+ int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
7369
+ dim3 gridDim(num_blocks, ne00, 1);
7370
+ timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
7371
  }
7372
 
7373
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
 
8500
 
8501
  template <typename T>
8502
  static void im2col_cuda(const float* x, T* dst,
8503
+ int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
8504
+ int64_t batch, int64_t batch_offset, int64_t offset_delta,
8505
  int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
8506
  const int parallel_elements = OW * KW * KH;
8507
  const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
 
9180
 
9181
  int num_groups = dst->op_params[0];
9182
  int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
9183
+ group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
9184
 
9185
  (void) src1;
9186
  (void) dst;
 
9213
 
9214
  const int scale_factor = dst->op_params[0];
9215
 
9216
+ upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
9217
 
9218
  (void) src1;
9219
  (void) dst;
 
9229
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
9230
 
9231
  pad_f32_cuda(src0_dd, dst_dd,
9232
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
9233
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
9234
+
9235
+ (void) src1;
9236
+ (void) dst;
9237
+ (void) src1_dd;
9238
+ }
9239
+
9240
+ static void ggml_cuda_op_arange(
9241
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
9242
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
9243
+
9244
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
9245
+
9246
+ float start;
9247
+ float stop;
9248
+ float step;
9249
+ memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
9250
+ memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
9251
+ memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
9252
+
9253
+ int64_t steps = (int64_t)ceil((stop - start) / step);
9254
+ GGML_ASSERT(ggml_nelements(dst) == steps);
9255
+
9256
+ arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
9257
+
9258
+ (void) src0;
9259
+ (void) src1;
9260
+ (void) src0_dd;
9261
+ (void) src1_dd;
9262
+ }
9263
+
9264
+ static void ggml_cuda_op_timestep_embedding(
9265
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
9266
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
9267
+
9268
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
9269
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
9270
+
9271
+ const int dim = dst->op_params[0];
9272
+ const int max_period = dst->op_params[1];
9273
+
9274
+ timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
9275
 
9276
  (void) src1;
9277
  (void) dst;
 
10556
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
10557
  }
10558
 
10559
+ static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10560
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
10561
+
10562
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
10563
+
10564
+ // dd = data device
10565
+ float * src0_ddf = nullptr;
10566
+ float * src1_ddf = nullptr;
10567
+ float * dst_ddf = nullptr;
10568
+
10569
+ cuda_pool_alloc<float> dst_f;
10570
+
10571
+ ggml_cuda_set_device(g_main_device);
10572
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
10573
+
10574
+ if (dst_on_device) {
10575
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
10576
+ } else {
10577
+ dst_ddf = dst_f.alloc(ggml_nelements(dst));
10578
+ }
10579
+
10580
+ // do the computation
10581
+ ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
10582
+ CUDA_CHECK(cudaGetLastError());
10583
+
10584
+ // copy dst to host if necessary
10585
+ if (!dst_on_device) {
10586
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
10587
+ }
10588
+
10589
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
10590
+ CUDA_CHECK(cudaDeviceSynchronize());
10591
+ }
10592
+ }
10593
+
10594
+ static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10595
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
10596
+ }
10597
+
10598
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10599
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
10600
  }
 
11495
  case GGML_OP_PAD:
11496
  func = ggml_cuda_pad;
11497
  break;
11498
+ case GGML_OP_ARANGE:
11499
+ func = ggml_cuda_arange;
11500
+ break;
11501
+ case GGML_OP_TIMESTEP_EMBEDDING:
11502
+ func = ggml_cuda_timestep_embedding;
11503
+ break;
11504
  case GGML_OP_LEAKY_RELU:
11505
  func = ggml_cuda_leaky_relu;
11506
  break;
 
12396
  case GGML_OP_GROUP_NORM:
12397
  case GGML_OP_UPSCALE:
12398
  case GGML_OP_PAD:
12399
+ case GGML_OP_ARANGE:
12400
+ case GGML_OP_TIMESTEP_EMBEDDING:
12401
  case GGML_OP_LEAKY_RELU:
12402
  return true;
12403
  default:
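
The timestep_embedding_f32 kernel added above parallelizes over timesteps (blockIdx.y) and over the first half of the embedding (threads on j < dim/2). Below is a plain-C reference of the same math, shown only to clarify what the kernel computes; the contiguous [n, dim] destination layout is an assumption of the sketch, not something this file defines.

// Plain-C reference of the math implemented by the timestep_embedding_f32 CUDA
// kernel above: cos() in the first half, sin() in the second half, one extra
// zero slot when dim is odd. Sketch only; contiguous f32 destination assumed.
#include <math.h>
#include <stdint.h>

static void timestep_embedding_ref(const float * timesteps, float * dst,
                                   int64_t n, int dim, int max_period) {
    const int half = dim / 2;
    const int row  = (dim % 2 == 0) ? dim : dim + 1; // odd dim gets one zero-padded slot

    for (int64_t i = 0; i < n; ++i) {
        float * embed = dst + i*row;
        for (int j = 0; j < half; ++j) {
            const float freq = expf(-logf((float) max_period) * j / half);
            const float arg  = timesteps[i] * freq;
            embed[j]        = cosf(arg);  // first half: cosines
            embed[j + half] = sinf(arg);  // second half: sines
        }
        if (dim % 2 != 0) {
            embed[dim] = 0.0f;            // matches embed_data[dim] = 0.f in the kernel
        }
    }
}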
ggml-metal.m CHANGED
@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
163
  GGML_METAL_KERNEL_TYPE_IM2COL_F32,
164
  GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
165
  GGML_METAL_KERNEL_TYPE_PAD_F32,
 
 
166
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
167
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
168
  GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
569
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
570
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
571
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
 
 
572
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
573
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
574
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
@@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
697
  return false;
698
  case GGML_OP_UPSCALE:
699
  case GGML_OP_PAD:
 
 
700
  case GGML_OP_ARGSORT:
701
  case GGML_OP_LEAKY_RELU:
702
  return true;
@@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute(
1091
  {
1092
  GGML_ASSERT(ggml_is_contiguous(src0));
1093
 
1094
- const float scale = *(const float *) dst->op_params;
 
1095
 
1096
  int64_t n = ggml_nelements(dst);
1097
 
@@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute(
1250
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
1251
  }
1252
 
1253
- const float scale = ((float *) dst->op_params)[0];
1254
- const float max_bias = ((float *) dst->op_params)[1];
 
 
 
1255
 
1256
  const int64_t nrows_x = ggml_nrows(src0);
1257
  const int64_t nrows_y = src0->ne[1];
 
1258
  const uint32_t n_head_kv = nrows_x/nrows_y;
1259
  const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
1260
 
@@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute(
2086
 
2087
  //const int n_past = ((int32_t *) dst->op_params)[0];
2088
  const int n_head = ((int32_t *) dst->op_params)[1];
 
2089
  float max_bias;
2090
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
2091
 
@@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute(
2300
 
2301
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2302
  } break;
2303
  case GGML_OP_ARGSORT:
2304
  {
2305
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
163
  GGML_METAL_KERNEL_TYPE_IM2COL_F32,
164
  GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
165
  GGML_METAL_KERNEL_TYPE_PAD_F32,
166
+ GGML_METAL_KERNEL_TYPE_ARANGE_F32,
167
+ GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
168
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
169
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
170
  GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
 
571
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
572
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
573
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
574
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
575
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
576
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
577
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
578
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
 
701
  return false;
702
  case GGML_OP_UPSCALE:
703
  case GGML_OP_PAD:
704
+ case GGML_OP_ARANGE:
705
+ case GGML_OP_TIMESTEP_EMBEDDING:
706
  case GGML_OP_ARGSORT:
707
  case GGML_OP_LEAKY_RELU:
708
  return true;
 
1097
  {
1098
  GGML_ASSERT(ggml_is_contiguous(src0));
1099
 
1100
+ float scale;
1101
+ memcpy(&scale, dst->op_params, sizeof(scale));
1102
 
1103
  int64_t n = ggml_nelements(dst);
1104
 
 
1257
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
1258
  }
1259
 
1260
+ float scale;
1261
+ float max_bias;
1262
+
1263
+ memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
1264
+ memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
1265
 
1266
  const int64_t nrows_x = ggml_nrows(src0);
1267
  const int64_t nrows_y = src0->ne[1];
1268
+
1269
  const uint32_t n_head_kv = nrows_x/nrows_y;
1270
  const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
1271
 
 
2097
 
2098
  //const int n_past = ((int32_t *) dst->op_params)[0];
2099
  const int n_head = ((int32_t *) dst->op_params)[1];
2100
+
2101
  float max_bias;
2102
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
2103
 
 
2312
 
2313
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2314
  } break;
2315
+ case GGML_OP_ARANGE:
2316
+ {
2317
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
2318
+
2319
+ float start;
2320
+ float step;
2321
+
2322
+ memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
2323
+ memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
2324
+
2325
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
2326
+
2327
+ [encoder setComputePipelineState:pipeline];
2328
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:0];
2329
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
2330
+ [encoder setBytes:&start length:sizeof(start) atIndex:2];
2331
+ [encoder setBytes:&step length:sizeof(step) atIndex:3];
2332
+
2333
+ const int nth = MIN(1024, ne0);
2334
+
2335
+ [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2336
+ } break;
2337
+ case GGML_OP_TIMESTEP_EMBEDDING:
2338
+ {
2339
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
2340
+
2341
+ const int dim = dst->op_params[0];
2342
+ const int max_period = dst->op_params[1];
2343
+
2344
+ const int half = dim / 2;
2345
+
2346
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
2347
+
2348
+ [encoder setComputePipelineState:pipeline];
2349
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
2350
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
2351
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
2352
+ [encoder setBytes:&dim length:sizeof(dim) atIndex:3];
2353
+ [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
2354
+
2355
+ const int nth = MIN(1024, half);
2356
+
2357
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2358
+ } break;
2359
  case GGML_OP_ARGSORT:
2360
  {
2361
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
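
Besides registering the new kernels, the ggml-metal.m changes above also stop reading float op params through a pointer cast and use memcpy instead (scale, max_bias). A small C sketch of that access pattern follows; fake_tensor and MAX_OP_PARAMS are stand-ins for illustration, not ggml types.

// Sketch of the op_params access pattern the Metal backend switches to in this
// commit: float values stored in the int32_t op_params array are read back with
// memcpy instead of a pointer cast, avoiding strict-aliasing problems.
#include <stdint.h>
#include <string.h>

#define MAX_OP_PARAMS 16           // stand-in for GGML_MAX_OP_PARAMS / sizeof(int32_t)

struct fake_tensor {               // stand-in holding only the op_params field
    int32_t op_params[MAX_OP_PARAMS];
};

static float get_param_f32(const struct fake_tensor * t, int i) {
    float v;
    memcpy(&v, &t->op_params[i], sizeof(v)); // bit-copy, no aliasing violation
    return v;
}

static void set_param_f32(struct fake_tensor * t, int i, float v) {
    memcpy(&t->op_params[i], &v, sizeof(v));
}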
ggml-metal.metal CHANGED
@@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
1959
  }
1960
  }
1961
 
1962
  // bitonic sort implementation following the CUDA kernels as reference
1963
  typedef void (argsort_t)(
1964
  device const float * x,
 
1959
  }
1960
  }
1961
 
1962
+ kernel void kernel_arange_f32(
1963
+ device char * dst,
1964
+ constant int64_t & ne0,
1965
+ constant float & start,
1966
+ constant float & step,
1967
+ uint3 tgpig[[threadgroup_position_in_grid]],
1968
+ uint3 tpitg[[thread_position_in_threadgroup]],
1969
+ uint3 ntg[[threads_per_threadgroup]]) {
1970
+
1971
+ device float * dst_ptr = (device float *) dst;
1972
+
1973
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
1974
+ dst_ptr[i0] = start + step * i0;
1975
+ }
1976
+ }
1977
+
1978
+ kernel void kernel_timestep_embedding_f32(
1979
+ device const char * src0,
1980
+ device char * dst,
1981
+ constant uint64_t & nb1,
1982
+ constant int & dim,
1983
+ constant int & max_period,
1984
+ uint3 tgpig[[threadgroup_position_in_grid]],
1985
+ uint3 tpitg[[thread_position_in_threadgroup]],
1986
+ uint3 ntg[[threads_per_threadgroup]]) {
1987
+
1988
+ int i = tgpig.x;
1989
+ device float * embed_data = (device float *)(dst + i*nb1);
1990
+
1991
+ int half_ = dim / 2;
1992
+ for (int j = tpitg.x; j < half_; j += ntg.x) {
1993
+ float timestep = ((device float *)src0)[i];
1994
+ float freq = (float)exp(-log((float)max_period) * j / half_);
1995
+ float arg = timestep * freq;
1996
+ embed_data[j ] = cos(arg);
1997
+ embed_data[j + half_] = sin(arg);
1998
+ }
1999
+
2000
+ if (dim % 2 != 0 && tpitg.x == 0) {
2001
+ embed_data[dim] = 0.f;
2002
+ }
2003
+ }
2004
+
2005
  // bitonic sort implementation following the CUDA kernels as reference
2006
  typedef void (argsort_t)(
2007
  device const float * x,
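
kernel_arange_f32 above distributes elements across a threadgroup with a strided loop (i0 starts at the thread index and advances by the threadgroup size). The CPU implementation added in ggml.c below uses the same partitioning across worker threads; here is a condensed C sketch of the pattern, with ith/nth standing in for the thread id and thread count.

// Strided partitioning shared by kernel_arange_f32 and the CPU path:
// element i is handled by thread (i % nth). Sketch only.
#include <stdint.h>

static void arange_ref(float * dst, int64_t ne0, float start, float step,
                       int ith, int nth) {
    for (int64_t i = ith; i < ne0; i += nth) {
        dst[i] = start + step * i;
    }
}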
ggml.c CHANGED
@@ -1822,6 +1822,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1822
  "POOL_2D",
1823
  "UPSCALE",
1824
  "PAD",
 
 
1825
  "ARGSORT",
1826
  "LEAKY_RELU",
1827
 
@@ -1850,7 +1852,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1850
  "CROSS_ENTROPY_LOSS_BACK",
1851
  };
1852
 
1853
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1854
 
1855
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1856
  "none",
@@ -1908,6 +1910,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1908
  "pool_2d(x)",
1909
  "upscale(x)",
1910
  "pad(x)",
 
 
1911
  "argsort(x)",
1912
  "leaky_relu(x)",
1913
 
@@ -1936,7 +1940,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1936
  "cross_entropy_loss_back(x,y)",
1937
  };
1938
 
1939
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1940
 
1941
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1942
 
@@ -2895,11 +2899,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
2895
  return ((const int32_t *)(tensor->op_params))[i];
2896
  }
2897
 
2898
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2899
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2900
  ((int32_t *)(tensor->op_params))[i] = value;
2901
  }
2902
 
2903
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2904
  memset(tensor->data, 0, ggml_nbytes(tensor));
2905
  return tensor;
@@ -5898,6 +5912,55 @@ struct ggml_tensor * ggml_upscale(
5898
  return ggml_upscale_impl(ctx, a, scale_factor);
5899
  }
5900
 
5901
  // ggml_argsort
5902
 
5903
  struct ggml_tensor * ggml_argsort(
@@ -10231,7 +10294,7 @@ static void ggml_compute_forward_group_norm_f32(
10231
  int n_channels = src0->ne[2];
10232
  int n_groups = dst->op_params[0];
10233
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10234
- for (int i = ith; i < n_groups; i+=nth) {
10235
  int start = i * n_channels_per_group;
10236
  int end = start + n_channels_per_group;
10237
  if (end > n_channels) {
@@ -10245,28 +10308,32 @@ static void ggml_compute_forward_group_norm_f32(
10245
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10246
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10247
 
 
10248
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10249
- sum += (ggml_float)x[i00];
10250
  }
 
10251
  }
10252
  }
10253
- float mean = sum / (ne00 * ne01 * step);
10254
- ggml_float sum2 = 0.0;
10255
 
 
10256
  for (int64_t i02 = start; i02 < end; i02++) {
10257
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10258
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10259
 
10260
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10261
 
 
10262
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10263
  float v = x[i00] - mean;
10264
  y[i00] = v;
10265
- sum2 += (ggml_float)(v * v);
10266
  }
 
10267
  }
10268
  }
10269
- float variance = sum2 / (ne00 * ne01 * step);
10270
  const float scale = 1.0f / sqrtf(variance + eps);
10271
 
10272
  for (int64_t i02 = start; i02 < end; i02++) {
@@ -13547,6 +13614,106 @@ static void ggml_compute_forward_pad(
13547
  }
13548
  }
13549
13550
  // ggml_compute_forward_argsort
13551
 
13552
  static void ggml_compute_forward_argsort_f32(
@@ -15615,6 +15782,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15615
  {
15616
  ggml_compute_forward_pad(params, tensor);
15617
  } break;
15618
  case GGML_OP_ARGSORT:
15619
  {
15620
  ggml_compute_forward_argsort(params, tensor);
@@ -16617,6 +16792,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16617
  {
16618
  GGML_ASSERT(false); // TODO: not implemented
16619
  } break;
16620
  case GGML_OP_ARGSORT:
16621
  {
16622
  GGML_ASSERT(false); // TODO: not implemented
@@ -17368,6 +17551,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17368
  {
17369
  n_tasks = n_threads;
17370
  } break;
17371
  case GGML_OP_ARGSORT:
17372
  {
17373
  n_tasks = n_threads;
 
1822
  "POOL_2D",
1823
  "UPSCALE",
1824
  "PAD",
1825
+ "ARANGE",
1826
+ "TIMESTEP_EMBEDDING",
1827
  "ARGSORT",
1828
  "LEAKY_RELU",
1829
 
 
1852
  "CROSS_ENTROPY_LOSS_BACK",
1853
  };
1854
 
1855
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1856
 
1857
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1858
  "none",
 
1910
  "pool_2d(x)",
1911
  "upscale(x)",
1912
  "pad(x)",
1913
+ "arange(start, stop, step)",
1914
+ "timestep_embedding(timesteps, dim, max_period)",
1915
  "argsort(x)",
1916
  "leaky_relu(x)",
1917
 
 
1940
  "cross_entropy_loss_back(x,y)",
1941
  };
1942
 
1943
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1944
 
1945
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1946
 
 
2899
  return ((const int32_t *)(tensor->op_params))[i];
2900
  }
2901
 
2902
+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
2903
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2904
+ return ((const float *)(tensor->op_params))[i];
2905
+ }
2906
+
2907
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2908
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2909
  ((int32_t *)(tensor->op_params))[i] = value;
2910
  }
2911
 
2912
+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
2913
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2914
+ ((float *)(tensor->op_params))[i] = value;
2915
+ }
2916
+
2917
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2918
  memset(tensor->data, 0, ggml_nbytes(tensor));
2919
  return tensor;
 
5912
  return ggml_upscale_impl(ctx, a, scale_factor);
5913
  }
5914
 
5915
+ struct ggml_tensor * ggml_arange(
5916
+ struct ggml_context * ctx,
5917
+ float start,
5918
+ float stop,
5919
+ float step) {
5920
+
5921
+ GGML_ASSERT(stop > start);
5922
+
5923
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
5924
+
5925
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5926
+
5927
+ result->op = GGML_OP_ARANGE;
5928
+ ggml_set_op_params_f32(result, 0, start);
5929
+ ggml_set_op_params_f32(result, 1, stop);
5930
+ ggml_set_op_params_f32(result, 2, step);
5931
+
5932
+ return result;
5933
+ }
5934
+
5935
+ struct ggml_tensor * ggml_timestep_embedding(
5936
+ struct ggml_context * ctx,
5937
+ struct ggml_tensor * timesteps,
5938
+ int dim,
5939
+ int max_period) {
5940
+ bool is_node = false;
5941
+
5942
+ if (timesteps->grad) {
5943
+ GGML_ASSERT(false); // TODO: implement backward
5944
+ is_node = true;
5945
+ }
5946
+
5947
+ int actual_dim = dim;
5948
+ if (dim % 2 != 0) {
5949
+ actual_dim = dim + 1;
5950
+ }
5951
+
5952
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
5953
+
5954
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
5955
+ ggml_set_op_params_i32(result, 0, dim);
5956
+ ggml_set_op_params_i32(result, 1, max_period);
5957
+
5958
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5959
+ result->src[0] = timesteps;
5960
+
5961
+ return result;
5962
+ }
5963
+
5964
  // ggml_argsort
5965
 
5966
  struct ggml_tensor * ggml_argsort(
 
10294
  int n_channels = src0->ne[2];
10295
  int n_groups = dst->op_params[0];
10296
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10297
+ for (int i = ith; i < n_groups; i += nth) {
10298
  int start = i * n_channels_per_group;
10299
  int end = start + n_channels_per_group;
10300
  if (end > n_channels) {
 
10308
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10309
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10310
 
10311
+ ggml_float sumr = 0.0;
10312
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10313
+ sumr += (ggml_float)x[i00];
10314
  }
10315
+ sum += sumr;
10316
  }
10317
  }
10318
+ const float mean = sum / (ne00 * ne01 * step);
 
10319
 
10320
+ ggml_float sum2 = 0.0;
10321
  for (int64_t i02 = start; i02 < end; i02++) {
10322
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10323
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10324
 
10325
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10326
 
10327
+ ggml_float sumr = 0.0;
10328
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10329
  float v = x[i00] - mean;
10330
  y[i00] = v;
10331
+ sumr += (ggml_float)(v * v);
10332
  }
10333
+ sum2 += sumr;
10334
  }
10335
  }
10336
+ const float variance = sum2 / (ne00 * ne01 * step);
10337
  const float scale = 1.0f / sqrtf(variance + eps);
10338
 
10339
  for (int64_t i02 = start; i02 < end; i02++) {
 
13614
  }
13615
  }
13616
 
13617
+
13618
+ // ggml_compute_forward_arange
13619
+
13620
+ static void ggml_compute_forward_arange_f32(
13621
+ const struct ggml_compute_params * params,
13622
+ struct ggml_tensor * dst) {
13623
+
13624
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13625
+ return;
13626
+ }
13627
+
13628
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
13629
+
13630
+ const int ith = params->ith;
13631
+ const int nth = params->nth;
13632
+
13633
+ const float start = ggml_get_op_params_f32(dst, 0);
13634
+ const float stop = ggml_get_op_params_f32(dst, 1);
13635
+ const float step = ggml_get_op_params_f32(dst, 2);
13636
+
13637
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
13638
+
13639
+ GGML_ASSERT(ggml_nelements(dst) == steps);
13640
+
13641
+ for (int64_t i = ith; i < steps; i+= nth) {
13642
+ float value = start + step * i;
13643
+ ((float *)dst->data)[i] = value;
13644
+ }
13645
+ }
13646
+
13647
+ static void ggml_compute_forward_arange(
13648
+ const struct ggml_compute_params * params,
13649
+ struct ggml_tensor * dst) {
13650
+ switch (dst->type) {
13651
+ case GGML_TYPE_F32:
13652
+ {
13653
+ ggml_compute_forward_arange_f32(params, dst);
13654
+ } break;
13655
+ default:
13656
+ {
13657
+ GGML_ASSERT(false);
13658
+ } break;
13659
+ }
13660
+ }
13661
+
13662
+ static void ggml_compute_forward_timestep_embedding_f32(
13663
+ const struct ggml_compute_params * params,
13664
+ struct ggml_tensor * dst) {
13665
+
13666
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13667
+ return;
13668
+ }
13669
+
13670
+ const struct ggml_tensor * src0 = dst->src[0];
13671
+
13672
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13673
+
13674
+ const int ith = params->ith;
13675
+ const int nth = params->nth;
13676
+
13677
+ GGML_TENSOR_UNARY_OP_LOCALS
13678
+
13679
+ const int dim = ggml_get_op_params_i32(dst, 0);
13680
+ const int max_period = ggml_get_op_params_i32(dst, 1);
13681
+
13682
+ int half = dim / 2;
13683
+
13684
+ for (int64_t i = 0; i < ne00; i++) {
13685
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
13686
+ for (int64_t j = ith; j < half; j += nth) {
13687
+ float timestep = ((float *)src0->data)[i];
13688
+ float freq = (float)expf(-logf(max_period) * j / half);
13689
+ float arg = timestep * freq;
13690
+ embed_data[j] = cosf(arg);
13691
+ embed_data[j + half] = sinf(arg);
13692
+ }
13693
+ if (dim % 2 != 0 && ith == 0) {
13694
+ embed_data[dim] = 0.f;
13695
+ }
13696
+ }
13697
+ }
13698
+
13699
+ static void ggml_compute_forward_timestep_embedding(
13700
+ const struct ggml_compute_params * params,
13701
+ struct ggml_tensor * dst) {
13702
+
13703
+ const struct ggml_tensor * src0 = dst->src[0];
13704
+
13705
+ switch (src0->type) {
13706
+ case GGML_TYPE_F32:
13707
+ {
13708
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
13709
+ } break;
13710
+ default:
13711
+ {
13712
+ GGML_ASSERT(false);
13713
+ } break;
13714
+ }
13715
+ }
13716
+
13717
  // ggml_compute_forward_argsort
13718
 
13719
  static void ggml_compute_forward_argsort_f32(
 
15782
  {
15783
  ggml_compute_forward_pad(params, tensor);
15784
  } break;
15785
+ case GGML_OP_ARANGE:
15786
+ {
15787
+ ggml_compute_forward_arange(params, tensor);
15788
+ } break;
15789
+ case GGML_OP_TIMESTEP_EMBEDDING:
15790
+ {
15791
+ ggml_compute_forward_timestep_embedding(params, tensor);
15792
+ } break;
15793
  case GGML_OP_ARGSORT:
15794
  {
15795
  ggml_compute_forward_argsort(params, tensor);
 
16792
  {
16793
  GGML_ASSERT(false); // TODO: not implemented
16794
  } break;
16795
+ case GGML_OP_ARANGE:
16796
+ {
16797
+ GGML_ASSERT(false); // TODO: not implemented
16798
+ } break;
16799
+ case GGML_OP_TIMESTEP_EMBEDDING:
16800
+ {
16801
+ GGML_ASSERT(false); // TODO: not implemented
16802
+ } break;
16803
  case GGML_OP_ARGSORT:
16804
  {
16805
  GGML_ASSERT(false); // TODO: not implemented
 
17551
  {
17552
  n_tasks = n_threads;
17553
  } break;
17554
+ case GGML_OP_ARANGE:
17555
+ {
17556
+ n_tasks = n_threads;
17557
+ } break;
17558
+ case GGML_OP_TIMESTEP_EMBEDDING:
17559
+ {
17560
+ n_tasks = n_threads;
17561
+ } break;
17562
  case GGML_OP_ARGSORT:
17563
  {
17564
  n_tasks = n_threads;
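
Alongside the new ops, ggml_compute_forward_group_norm_f32 above is reworked so each row is accumulated into a local ggml_float partial sum (sumr) before being folded into the group total, which limits rounding error when groups are large. Below is a condensed C sketch of that two-pass mean/variance pattern, with shapes and strides simplified and double standing in for ggml_float.

// Condensed sketch of the accumulation pattern used by the reworked group_norm:
// per-row partial sums in double, folded into the group totals, then a second
// pass for the variance. Layout simplified to a dense [nrows, ne00] block.
#include <math.h>
#include <stdint.h>

static void group_mean_var(const float * x, int64_t nrows, int64_t ne00,
                           float * out_mean, float * out_var) {
    double sum = 0.0;
    for (int64_t r = 0; r < nrows; ++r) {
        double sumr = 0.0;                 // per-row partial sum
        for (int64_t i = 0; i < ne00; ++i) {
            sumr += x[r*ne00 + i];
        }
        sum += sumr;
    }
    const float mean = (float)(sum / (nrows * ne00));

    double sum2 = 0.0;
    for (int64_t r = 0; r < nrows; ++r) {
        double sumr = 0.0;
        for (int64_t i = 0; i < ne00; ++i) {
            const float v = x[r*ne00 + i] - mean;
            sumr += (double) v * v;
        }
        sum2 += sumr;
    }
    *out_mean = mean;
    *out_var  = (float)(sum2 / (nrows * ne00));
}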
ggml.h CHANGED
@@ -454,6 +454,8 @@ extern "C" {
454
  GGML_OP_POOL_2D,
455
  GGML_OP_UPSCALE, // nearest interpolate
456
  GGML_OP_PAD,
 
 
457
  GGML_OP_ARGSORT,
458
  GGML_OP_LEAKY_RELU,
459
 
@@ -1661,6 +1663,15 @@ extern "C" {
1661
  int p2,
1662
  int p3);
1663
 
  // sort rows
1665
  enum ggml_sort_order {
1666
  GGML_SORT_ORDER_ASC,
@@ -1672,6 +1683,12 @@ extern "C" {
1672
  struct ggml_tensor * a,
1673
  enum ggml_sort_order order);
1674
 
1675
  // top k elements per row
1676
  GGML_API struct ggml_tensor * ggml_top_k(
1677
  struct ggml_context * ctx,
 
454
  GGML_OP_POOL_2D,
455
  GGML_OP_UPSCALE, // nearest interpolate
456
  GGML_OP_PAD,
457
+ GGML_OP_ARANGE,
458
+ GGML_OP_TIMESTEP_EMBEDDING,
459
  GGML_OP_ARGSORT,
460
  GGML_OP_LEAKY_RELU,
461
 
 
1663
  int p2,
1664
  int p3);
1665
 
1666
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1667
+ // timesteps: [N,]
1668
+ // return: [N, dim]
1669
+ GGML_API struct ggml_tensor * ggml_timestep_embedding(
1670
+ struct ggml_context * ctx,
1671
+ struct ggml_tensor * timesteps,
1672
+ int dim,
1673
+ int max_period);
1674
+
1675
  // sort rows
1676
  enum ggml_sort_order {
1677
  GGML_SORT_ORDER_ASC,
 
1683
  struct ggml_tensor * a,
1684
  enum ggml_sort_order order);
1685
 
1686
+ GGML_API struct ggml_tensor * ggml_arange(
1687
+ struct ggml_context * ctx,
1688
+ float start,
1689
+ float stop,
1690
+ float step);
1691
+
1692
  // top k elements per row
1693
  GGML_API struct ggml_tensor * ggml_top_k(
1694
  struct ggml_context * ctx,
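
The number of elements produced by ggml_arange follows directly from the implementation in ggml.c above: ceil((stop - start) / step), with stop > start asserted. A small helper sketch (not part of the ggml API) makes the arithmetic explicit:

// Element count of ggml_arange(ctx, start, stop, step), mirroring the
// ceilf-based computation in ggml.c; helper is illustrative only.
#include <math.h>
#include <stdint.h>

static int64_t arange_steps(float start, float stop, float step) {
    return (int64_t) ceilf((stop - start) / step);
}

// arange_steps(0.0f, 10.0f, 2.0f) == 5   -> {0, 2, 4, 6, 8}
// arange_steps(0.0f,  1.0f, 0.3f) == 4   -> {0.0f, 0.3f, 0.6f, 0.9f}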