llava : add MobileVLM support (llama/5132)
* New Feature:
1. Sum_Rows:
   fix cuda kernel overflow
   fix block shape error when nrows too big
2. Im2Col:
   Support batch in CUDA
   Support f32 to f32 in both CPU and CUDA
3. DepthWiseConv:
   Support via Im2Col && MulMat
4. Pool_2d:
   Support avg pooling in CUDA
5. HardSigmoid:
   Implement in CUDA
6. HardSwish:
   Implement in CUDA
* fix tabs instead of spaces
* code clean
* CUDA POOL2D
* ADD POOL2D test case in test-backend-ops.cpp
* code clean
* fix pool2d_kernel nits
* fix bug in pool2d kernel
* fix avg pooling, count_include_pad nits
* test-backend-ops : add more pool_2d tests
* cuda : fix warnings and formatting
* ggml : check types in release builds too in pool_2d
* test-backend-ops : remove f16 pool_2d tests
* cuda : more style fixes
* Add assert in ggml_cuda_op_pool2d
* pool2d float padding fallback
* test-backend-ops : add dst_type to im2col
---------
Co-authored-by: slaren <[email protected]>
- ggml-cuda.cu +192 -17
- ggml.c +102 -16
- ggml.h +2 -1
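
The user-facing API change is the extra `dst_type` argument on `ggml_im2col` (see the ggml.h hunk at the end). A minimal usage sketch, not taken from the commit itself; `ctx`, `ker`, and `img` are assumed to already exist:

    // sketch: request f32 columns instead of the previously implicit f16
    // ker: convolution kernel (F16, assumed), img: input image (F32, assumed)
    struct ggml_tensor * cols = ggml_im2col(ctx, ker, img,
                                            /*s0*/ 1, /*s1*/ 1,
                                            /*p0*/ 0, /*p1*/ 0,
                                            /*d0*/ 1, /*d1*/ 1,
                                            /*is_2D*/    true,
                                            /*dst_type*/ GGML_TYPE_F32);

Before this commit the call took no type argument and always produced an F16 tensor; existing callers (ggml_conv_1d, ggml_conv_2d) now pass GGML_TYPE_F16 explicitly.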
ggml-cuda.cu

@@ -524,6 +524,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
+#define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -540,6 +542,7 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
 #define CUDA_PAD_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
+#define CUDA_POOL2D_BLOCK_SIZE 256
 
 #define CUDA_Q8_0_NE_ALIGN 2048
 
@@ -823,6 +826,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
+static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
 static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
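For reference, the two new kernels compute hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x), the clipped piecewise-linear approximations of sigmoid and SiLU. A CPU sketch of the same semantics, not part of the commit:

    #include <math.h>

    // reference semantics of the kernels above (illustrative only)
    static float ref_hardsigmoid(float x) {
        return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
    }
    static float ref_hardswish(float x) {
        return x * ref_hardsigmoid(x); // == x * clamp((x + 3) / 6, 0, 1)
    }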
@@ -5823,7 +5844,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
 }
 
 static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.y;
+    const int row = blockIdx.x;
     const int col = threadIdx.x;
 
     float sum = 0.0f;
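This is the "block shape error when nrows too big" fix from the commit message: CUDA caps gridDim.y and gridDim.z at 65535 blocks, while gridDim.x allows up to 2^31 - 1, so a one-block-per-row launch must map rows onto the x axis. An illustrative sketch of the two launch shapes (see the paired sum_rows_f32_cuda hunk further down):

    // illustrative only: the old grid put rows on the y axis and broke
    // past 65535 rows; the new grid uses x, which allows up to 2^31 - 1
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums_old(1, nrows, 1); // invalid once nrows > 65535
    const dim3 block_nums_new(nrows, 1, 1); // fine for large nrows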
@@ -6145,9 +6166,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
-static __global__ void im2col_f32_f16(
-        const float * x, half * dst,
-        int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
+template <typename T>
+static __global__ void im2col_kernel(
+        const float * x, T * dst, int batch_offset,
+        int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
     const int i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
@@ -6160,21 +6182,73 @@ static __global__ void im2col_f32_f16(
     const int ky = (i - kd) / OW;
     const int ix = i % OW;
 
+    const int oh = blockIdx.y;
+    const int batch = blockIdx.z / IC;
+    const int ic = blockIdx.z % IC;
+
     const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
+    const int64_t iih = oh * s1 + ky * d1 - p1;
 
     const int64_t offset_dst =
-        (blockIdx.y * OW + ix) * CHW +
-        (blockIdx.z * (KW * KH) + ky * KW + kx);
+        ((batch * OH + oh) * OW + ix) * CHW +
+        (ic * (KW * KH) + ky * KW + kx);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = __float2half(0.0f);
+        dst[offset_dst] = 0.0f;
     } else {
-        const int64_t offset_src = blockIdx.z * offset_delta;
-        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
     }
 }
 
+template <typename Ti, typename To>
+static __global__ void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti* i_ptr = src + nc * I_HW;
+    To* o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = max(0, start_h);
+    const int eh = min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = max(0, start_w);
+    const int ew = min(iw, start_w + kw);
+    const To scale = 1. / (kh * kw);
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG: res = 0; break;
+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+#if __CUDA_ARCH__ >= 350
+            Ti cur = __ldg(i_ptr + i * iw + j);
+#else
+            Ti cur = i_ptr[i * iw + j];
+#endif
+            switch (op) {
+                case GGML_OP_POOL_AVG: res += cur * scale; break;
+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
         const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
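One consequence of `scale = 1. / (kh * kw)` above: the average is always taken over the full window area, so zero padding counts toward the denominator (count_include_pad semantics, matching the "fix avg pooling, count_include_pad" item in the commit message). A quick illustrative check, not from the commit:

    // illustrative: a 2x2 average window at a padded corner that covers
    // only one valid element v contributes v / (kh*kw), not v / 1
    const float v     = 8.0f;
    const float scale = 1.0f / (2 * 2); // 1 / (kh * kw), as in the kernel
    const float out   = v * scale;      // 2.0f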
@@ -6388,6 +6462,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
+    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
+    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
@@ -7475,7 +7559,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
 
 static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
+    const dim3 block_nums(nrows, 1, 1);
     k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
@@ -7587,14 +7671,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     }
 }
 
-static void im2col_f32_f16_cuda(const float* x, half* dst,
+template <typename T>
+static void im2col_cuda(const float* x, T* dst,
     int IW, int IH, int OW, int OH, int KW, int KH, int IC,
-    int offset_delta,
+    int batch, int batch_offset, int offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, IC);
-    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }
 
 // buffer pool for cuda
@@ -8179,6 +8264,34 @@ static void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_hardsigmoid(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_hardswish(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_leaky_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -8810,13 +8923,46 @@ static void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_pool2d(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_im2col(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
     const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -8838,8 +8984,14 @@ static void ggml_cuda_op_im2col(
     const int64_t OW = dst->ne[1];
 
     const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    } else {
+        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    }
 
     (void) src0;
     (void) src0_dd;
@@ -9435,6 +9587,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
+}
+
+static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
+}
 static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
 }
@@ -10220,6 +10379,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
+}
+
 static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
@@ -10321,6 +10484,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_UNARY_OP_RELU:
             func = ggml_cuda_relu;
             break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            func = ggml_cuda_hardsigmoid;
+            break;
+        case GGML_UNARY_OP_HARDSWISH:
+            func = ggml_cuda_hardswish;
+            break;
         default:
             return false;
     }
@@ -10395,6 +10564,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_IM2COL:
             func = ggml_cuda_im2col;
            break;
+        case GGML_OP_POOL_2D:
+            func = ggml_cuda_pool2d;
+            break;
         case GGML_OP_SUM_ROWS:
             func = ggml_cuda_sum_rows;
             break;
@@ -11123,6 +11295,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                     return true;
@@ -11221,6 +11395,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
+        case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
ggml.c

@@ -5349,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         int s0,
         int p0,
         int d0) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
 
     struct ggml_tensor * result =
         ggml_mul_mat(ctx,
@@ -5427,16 +5427,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
         int p1,
         int d0,
         int d1) {
+
     struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
     struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                         ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
-                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
 
+    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
+    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
     result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
 
     return result;
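Depthwise convolution is thus expressed entirely through existing ops: each channel is lowered to columns by im2col and multiplied against its own KH * KW filter by mul_mat, which is what the "DepthWiseConv: Support via Im2Col && MulMat" item in the commit message refers to. A hedged usage sketch, not from the commit, with `ctx`, `ker`, and `img` as assumed tensors (shapes in ggml's ne order):

    // sketch: depthwise 3x3 convolution, stride 1, padding 1, dilation 1
    // ker: one KW x KH filter per channel, e.g. ne = [3, 3, C, 1] (F16, assumed)
    // img: input image,                    e.g. ne = [W, H, C, N] (F32, assumed)
    struct ggml_tensor * out = ggml_conv_depthwise_2d(ctx, ker, img,
                                                      /*s0*/ 1, /*s1*/ 1,
                                                      /*p0*/ 1, /*p1*/ 1,
                                                      /*d0*/ 1, /*d1*/ 1);
    // out: ne = [OW, OH, C, N], with OW == W and OH == H for these parameters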
@@ -5457,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
         int p1,
         int d0,
         int d1,
-        bool is_2D) {
+        bool is_2D,
+        enum ggml_type dst_type) {
 
     if(is_2D) {
         GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5481,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
         is_2D ? b->ne[3] : 1,
     };
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
     int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
@@ -5506,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
         int p1,
         int d0,
         int d1) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
 
     struct ggml_tensor * result =
         ggml_mul_mat(ctx,
@@ -5632,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
         is_node = true;
     }
 
+    struct ggml_tensor * result;
     const int64_t ne[3] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
@@ -5645,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-
     return result;
 }
 
@@ -12493,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
     }
 }
 
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst:  result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst: result [N, OH, OW, IC*KH*KW]
@@ -12583,14 +12669,14 @@ static void ggml_compute_forward_im2col(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    switch (src0->type) {
+    switch (dst->type) {
         case GGML_TYPE_F16:
             {
                 ggml_compute_forward_im2col_f16(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(false);
+                ggml_compute_forward_im2col_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -12781,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src,
         struct ggml_tensor * dst) {
-    assert(src->type == GGML_TYPE_F32);
-    assert(params->ith == 0);
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
ggml.h

@@ -1500,7 +1500,8 @@ extern "C" {
             int p1,
             int d0,
             int d1,
-            bool is_2D);
+            bool is_2D,
+            enum ggml_type dst_type);
 
     GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
             struct ggml_context * ctx,
|
| 1500 |
int p1,
|
| 1501 |
int d0,
|
| 1502 |
int d1,
|
| 1503 |
+
bool is_2D,
|
| 1504 |
+
enum ggml_type dst_type);
|
| 1505 |
|
| 1506 |
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
| 1507 |
struct ggml_context * ctx,
|