JidongZhang-THU and slaren committed
Commit f17a416 (unverified) · 1 parent: 2645c33

llava : add MobileVLM support (llama/5132)


* New features:
1. Sum_Rows:
fix CUDA kernel overflow
fix block shape error when nrows is too big
2. Im2Col:
support batch in CUDA
support f32 to f32 in both CPU and CUDA
3. DepthWiseConv:
supported via Im2Col && MulMat
4. Pool_2d:
support avg pooling in CUDA
5. HardSigmoid:
implemented in CUDA
6. HardSwish:
implemented in CUDA (reference formulas for 5 and 6 are sketched right after the commit message)

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <[email protected]>
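
For quick reference, the two new unary ops have simple closed forms: hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x). The following plain-C sketch (ours, not part of the patch) mirrors the formulas used by the hardsigmoid_f32 and hardswish_f32 CUDA kernels added below.

```c
#include <math.h>
#include <stdio.h>

// Scalar reference versions of the activations added in this commit.
// They follow the same expressions as the CUDA kernels in ggml-cuda.cu.
static float ref_hardsigmoid(float x) {
    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

static float ref_hardswish(float x) {
    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

int main(void) {
    const float xs[] = { -4.0f, -1.0f, 0.0f, 1.0f, 4.0f };
    for (int i = 0; i < 5; ++i) {
        printf("x=% .1f  hardsigmoid=% .4f  hardswish=% .4f\n",
               xs[i], ref_hardsigmoid(xs[i]), ref_hardswish(xs[i]));
    }
    return 0;
}
```

Hard-sigmoid and hard-swish are the cheap piecewise approximations of sigmoid and SiLU popularized by MobileNetV3, which is why the MobileVLM path added here needs them.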

Files changed (3)
  1. ggml-cuda.cu +192 -17
  2. ggml.c +102 -16
  3. ggml.h +2 -1
ggml-cuda.cu CHANGED
@@ -524,6 +524,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
+#define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -540,6 +542,7 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
 #define CUDA_PAD_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
+#define CUDA_POOL2D_BLOCK_SIZE 256
 
 #define CUDA_Q8_0_NE_ALIGN 2048
 
@@ -823,6 +826,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
+static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
 static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
@@ -5823,7 +5844,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
 }
 
 static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.y;
+    const int row = blockIdx.x;
     const int col = threadIdx.x;
 
     float sum = 0.0f;
@@ -6145,9 +6166,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
-static __global__ void im2col_f32_f16(
-        const float * x, half * dst,
-        int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
+template <typename T>
+static __global__ void im2col_kernel(
+        const float * x, T * dst, int batch_offset,
+        int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
     const int i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
@@ -6160,21 +6182,73 @@ static __global__ void im2col_f32_f16(
     const int ky = (i - kd) / OW;
     const int ix = i % OW;
 
+    const int oh = blockIdx.y;
+    const int batch = blockIdx.z / IC;
+    const int ic = blockIdx.z % IC;
+
     const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
+    const int64_t iih = oh * s1 + ky * d1 - p1;
 
     const int64_t offset_dst =
-        (blockIdx.y * OW + ix) * CHW +
-        (blockIdx.z * (KW * KH) + ky * KW + kx);
+        ((batch * OH + oh) * OW + ix) * CHW +
+        (ic * (KW * KH) + ky * KW + kx);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = __float2half(0.0f);
+        dst[offset_dst] = 0.0f;
     } else {
-        const int64_t offset_src = blockIdx.z * offset_delta;
-        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
     }
 }
 
+template <typename Ti, typename To>
+static __global__ void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti* i_ptr = src + nc * I_HW;
+    To* o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = max(0, start_h);
+    const int eh = min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = max(0, start_w);
+    const int ew = min(iw, start_w + kw);
+    const To scale = 1. / (kh * kw);
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG: res = 0; break;
+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+            #if __CUDA_ARCH__ >= 350
+            Ti cur = __ldg(i_ptr + i * iw + j);
+            #else
+            Ti cur = i_ptr[i * iw + j];
+            #endif
+            switch (op) {
+                case GGML_OP_POOL_AVG: res += cur * scale; break;
+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                             const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
@@ -6388,6 +6462,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
+    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
+    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
@@ -7475,7 +7559,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
 
 static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
+    const dim3 block_nums(nrows, 1, 1);
     k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
@@ -7587,14 +7671,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     }
 }
 
-static void im2col_f32_f16_cuda(const float* x, half* dst,
+template <typename T>
+static void im2col_cuda(const float* x, T* dst,
     int IW, int IH, int OW, int OH, int KW, int KH, int IC,
-    int offset_delta,
+    int batch, int batch_offset, int offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, IC);
-    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }
 
 // buffer pool for cuda
@@ -8179,6 +8264,34 @@ static void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_hardsigmoid(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_hardswish(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_leaky_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -8810,13 +8923,46 @@ static void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_pool2d(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_im2col(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
     const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -8838,8 +8984,14 @@ static void ggml_cuda_op_im2col(
     const int64_t OW = dst->ne[1];
 
     const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    } else {
+        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    }
 
     (void) src0;
     (void) src0_dd;
@@ -9435,6 +9587,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
+}
+
+static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
+}
 static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
 }
@@ -10220,6 +10379,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
+}
+
 static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
@@ -10321,6 +10484,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
             case GGML_UNARY_OP_RELU:
                 func = ggml_cuda_relu;
                 break;
+            case GGML_UNARY_OP_HARDSIGMOID:
+                func = ggml_cuda_hardsigmoid;
+                break;
+            case GGML_UNARY_OP_HARDSWISH:
+                func = ggml_cuda_hardswish;
+                break;
             default:
                 return false;
         }
@@ -10395,6 +10564,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_IM2COL:
             func = ggml_cuda_im2col;
            break;
+        case GGML_OP_POOL_2D:
+            func = ggml_cuda_pool2d;
+            break;
        case GGML_OP_SUM_ROWS:
            func = ggml_cuda_sum_rows;
            break;
@@ -11123,6 +11295,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
+               case GGML_UNARY_OP_HARDSIGMOID:
+               case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                    return true;
@@ -11221,6 +11395,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_ROPE:
        case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
+       case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
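
A detail of the new POOL_2D path that is easy to miss in the CUDA hunk above: for average pooling the divisor is always kh*kw, even when the window is clipped by padding at the borders (the count_include_pad behaviour referenced in the follow-up commit message). Below is a scalar C sketch of the same per-output-element arithmetic for a single NCHW channel plane; it is our illustration, not code from the patch.

```c
#include <float.h>
#include <stdio.h>

// Reference for one output element of the pooling kernel above.
// Note the count_include_pad behaviour: for average pooling the divisor is
// always kh*kw, even when the window is clipped at the borders, matching
// pool2d_nchw_kernel.
static float pool2d_ref(const float * plane, int ih, int iw,
                        int kh, int kw, int sh, int sw, int ph, int pw,
                        int oh_idx, int ow_idx, int is_max) {
    const int start_h = oh_idx * sh - ph;
    const int start_w = ow_idx * sw - pw;
    const int bh = start_h < 0 ? 0 : start_h;
    const int bw = start_w < 0 ? 0 : start_w;
    const int eh = start_h + kh > ih ? ih : start_h + kh;
    const int ew = start_w + kw > iw ? iw : start_w + kw;

    float res = is_max ? -FLT_MAX : 0.0f;
    const float scale = 1.0f / (kh * kw);

    for (int i = bh; i < eh; ++i) {
        for (int j = bw; j < ew; ++j) {
            const float cur = plane[i * iw + j];
            if (is_max) {
                res = cur > res ? cur : res;
            } else {
                res += cur * scale;
            }
        }
    }
    return res;
}

int main(void) {
    // 2x2 average pooling with padding 1 on a 2x2 plane of ones:
    // the corner window sees a single 1.0 but still divides by 4.
    const float plane[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
    printf("corner avg = %.2f (count_include_pad)\n",
           pool2d_ref(plane, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0));
    return 0;
}
```

With a 2x2 window, stride 2 and padding 1 over a 2x2 plane of ones, the corner output is 0.25 rather than 1.0, because the three padded positions still count toward the divisor.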
ggml.c CHANGED
@@ -5349,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         int                   s0,
         int                   p0,
         int                   d0) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
 
     struct ggml_tensor * result =
         ggml_mul_mat(ctx,
@@ -5427,16 +5427,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
         int                   p1,
         int                   d0,
         int                   d1) {
+
     struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
     struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                         ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
-                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
 
+    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
+    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
     result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
 
     return result;
@@ -5457,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
         int                   p1,
         int                   d0,
         int                   d1,
-        bool                  is_2D) {
+        bool                  is_2D,
+        enum ggml_type        dst_type) {
 
     if(is_2D) {
         GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5481,7 +5481,7 @@
         is_2D ? b->ne[3] : 1,
     };
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
     int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
@@ -5506,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
         int                   p1,
         int                   d0,
         int                   d1) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
 
     struct ggml_tensor * result =
         ggml_mul_mat(ctx,
@@ -5632,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
         is_node = true;
     }
 
+    struct ggml_tensor * result;
     const int64_t ne[3] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
@@ -5645,7 +5646,6 @@
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-
     return result;
 }
 
@@ -12493,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
     }
 }
 
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst: result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst: result [N, OH, OW, IC*KH*KW]
@@ -12583,14 +12669,14 @@
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    switch (src0->type) {
+    switch (dst->type) {
         case GGML_TYPE_F16:
             {
                 ggml_compute_forward_im2col_f16(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(false);
+                ggml_compute_forward_im2col_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -12781,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
         const struct ggml_compute_params * params,
        const struct ggml_tensor * src,
        struct ggml_tensor * dst) {
-    assert(src->type == GGML_TYPE_F32);
-    assert(params->ith == 0);
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
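
To see the rewritten ggml_conv_depthwise_2d end to end, here is a small host-side C sketch that builds and runs the op on the CPU backend. It is our illustration under stated assumptions: the kernel is laid out as [KW, KH, 1, C] in F16, the input as [W, H, C, N] in F32, a 64 MB context is enough, and the graph helpers ggml_new_graph / ggml_graph_compute_with_ctx are available in this revision.

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 64*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int C = 8, W = 16, H = 16, KW = 3, KH = 3;
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, KW, KH, 1, C); // depthwise kernel (assumed layout)
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);   // input image

    ggml_set_f32(a, 1.0f/(KW*KH)); // per-channel box filter
    ggml_set_f32(b, 1.0f);

    // stride 1, padding 1, dilation 1 -> same spatial size
    struct ggml_tensor * y = ggml_conv_depthwise_2d(ctx, a, b, 1, 1, 1, 1, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    printf("out: [%d, %d, %d, %d]\n", (int)y->ne[0], (int)y->ne[1], (int)y->ne[2], (int)y->ne[3]);

    ggml_free(ctx);
    return 0;
}
```

Internally the call now lowers to ggml_im2col(..., true, GGML_TYPE_F16) followed by a single ggml_mul_mat and two reshapes, as the ggml.c hunk above shows.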
ggml.h CHANGED
@@ -1500,7 +1500,8 @@ extern "C" {
             int                   p1,
             int                   d0,
             int                   d1,
-            bool                  is_2D);
+            bool                  is_2D,
+            enum ggml_type        dst_type);
 
     GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
             struct ggml_context * ctx,
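
The only interface change in ggml.h is the trailing dst_type argument of ggml_im2col. A minimal caller-side sketch (ours; the kernel and image tensors are assumed to be set up as in the previous example):

```c
#include "ggml.h"

// dst_type is the new argument from this commit: GGML_TYPE_F16 reproduces the
// old behaviour, GGML_TYPE_F32 keeps the unfolded tensor in f32 and exercises
// the new f32 im2col paths on CPU and CUDA.
struct ggml_tensor * unfold_to_f32(struct ggml_context * ctx,
                                   struct ggml_tensor * kernel,
                                   struct ggml_tensor * image) {
    return ggml_im2col(ctx, kernel, image,
                       /*s0*/ 1, /*s1*/ 1,
                       /*p0*/ 1, /*p1*/ 1,
                       /*d0*/ 1, /*d1*/ 1,
                       /*is_2D*/ true,
                       GGML_TYPE_F32);
}
```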