add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)
* cuda: fix group_norm
* cuda: add batch inference support for ggml_pad/ggml_upscale
* add ggml_arange
* add ggml_timestep_embedding
* update ggml_arange/ggml_timestep_embedding tests
* cuda: fix im2col
* add ggml_arange/ggml_timestep_embedding support for metal backend
* fix some bugs
* fix some bugs
* Update ggml.h
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml-cuda.cu
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml-metal.m
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml-metal.m
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml-metal.metal
Co-authored-by: Georgi Gerganov <[email protected]>
* modify according to the review comments
* ggml : fix compile warnings + code style
* ggml : normalize compute_forward calls + fix seg fault in debug
* minor
---------
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: slaren <[email protected]>
- ggml-cuda.cu +187 -42
- ggml-metal.m +59 -3
- ggml-metal.metal +43 -0
- ggml.c +199 -8
- ggml.h +17 -0
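
For context, a minimal usage sketch of the two new operators through the public API added in ggml.h below. This is illustrative only: the graph helpers (ggml_init, ggml_new_graph, ggml_build_forward_expand, ggml_graph_compute_with_ctx) are the pre-existing CPU path, not part of this change, and the buffer size, embedding dim of 256 and max_period of 10000 are arbitrary example values.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,   // illustrative scratch size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // timesteps 0, 1, ..., 9 as an F32 tensor of 10 elements
    struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);

    // one 256-dimensional sinusoidal embedding per timestep
    struct ggml_tensor * e = ggml_timestep_embedding(ctx, t, 256, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, e);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    printf("emb[0][0] = %f\n", ggml_get_f32_1d(e, 0));

    ggml_free(ctx);
    return 0;
}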
ggml-cuda.cu

@@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ARANGE_BLOCK_SIZE 256
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256

@@ -990,17 +992,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
             nidx +
             blockIdx.y * ne0 +
             blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
     } else {
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
             (blockIdx.z - ne02) * ne0 * gridDim.y;
+        dst[offset_dst] = y[offset_src];
     }
 }

+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
+    // blockIdx.z: idx of ne02*ne03
+    // blockIdx.y: idx of ne01*scale_factor, aka ne1
+    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
+    // ne00xne01: ne00 * ne01
     int ne0 = ne00 * scale_factor;
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {

@@ -1012,7 +1018,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     int offset_src =
         i00 +
         i01 * ne00 +
+        blockIdx.z * ne00xne01;
     int offset_dst =
         nidx +
         blockIdx.y * ne0 +

@@ -1020,7 +1026,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     dst[offset_dst] = x[offset_src];
 }

+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;

@@ -1031,19 +1040,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
             blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
     } else {
         dst[offset_dst] = 0.0f;
     }
 }

+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIDx.y: idx of timesteps->ne[0]
+    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst + i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
     int start = blockIdx.x * group_size;
     int end = start + group_size;

@@ -6449,7 +6492,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                    const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= ne) {
         return;

@@ -6457,17 +6500,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
     // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

     cpy_1(cx + x_offset, cdst + dst_offset);
 }

@@ -6956,23 +6999,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 template <typename T>
 static __global__ void im2col_kernel(
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
         return;
     }

+    const int64_t ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t kx = i / ksize;
+    const int64_t kd = kx * ksize;
+    const int64_t ky = (i - kd) / OW;
+    const int64_t ix = i % OW;

+    const int64_t oh = blockIdx.y;
+    const int64_t batch = blockIdx.z / IC;
+    const int64_t ic = blockIdx.z % IC;

     const int64_t iiw = ix * s0 + kx * d0 - p0;
     const int64_t iih = oh * s1 + ky * d1 - p1;

@@ -7298,19 +7341,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
     concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }

+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
+                             const int scale_factor, cudaStream_t stream) {
     int ne0 = (ne00 * scale_factor);
     int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
     upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 }

 static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {

@@ -8443,8 +8500,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
 template <typename T>
 static void im2col_cuda(const float* x, T* dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;

@@ -9123,7 +9180,7 @@ static void ggml_cuda_op_group_norm(
     int num_groups = dst->op_params[0];
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);

     (void) src1;
     (void) dst;

@@ -9156,7 +9213,7 @@ static void ggml_cuda_op_upscale(
     const int scale_factor = dst->op_params[0];

+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);

     (void) src1;
     (void) dst;

@@ -9172,8 +9229,49 @@ static void ggml_cuda_op_pad(
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

     pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_arange(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
+
+    (void) src0;
+    (void) src1;
+    (void) src0_dd;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_timestep_embedding(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);

     (void) src1;
     (void) dst;

@@ -10458,6 +10556,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }

+static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float *  dst_ddf = nullptr;
+
+    cuda_pool_alloc<float> dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
+static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

@@ -11358,6 +11495,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
             break;
+        case GGML_OP_ARANGE:
+            func = ggml_cuda_arange;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            func = ggml_cuda_timestep_embedding;
+            break;
         case GGML_OP_LEAKY_RELU:
             func = ggml_cuda_leaky_relu;
             break;

@@ -12253,6 +12396,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
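The batch support in the pad/upscale wrappers above comes down to folding the two outer tensor dimensions into gridDim.z. A host-side sketch of the same arithmetic in plain C, for illustration only (the real launches are the <<<...>>> calls shown in the diff; the sizes in main are arbitrary):

#include <assert.h>

// Mirrors the launch geometry of pad_f32_cuda/upscale_f32_cuda: blocks of 256
// threads along ne0, one block row per ne1, and ne2*ne3 folded into grid z so
// a 4D tensor is covered by a single kernel launch.
#define BLOCK_SIZE 256

static void grid_dims_for_pad(int ne0, int ne1, int ne2, int ne3,
                              int * grid_x, int * grid_y, int * grid_z) {
    *grid_x = (ne0 + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil(ne0 / 256)
    *grid_y = ne1;
    *grid_z = ne2 * ne3;                           // batch dims combined
}

int main(void) {
    int gx, gy, gz;
    grid_dims_for_pad(/*ne0=*/300, /*ne1=*/64, /*ne2=*/8, /*ne3=*/2, &gx, &gy, &gz);
    assert(gx == 2 && gy == 64 && gz == 16);
    return 0;
}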
ggml-metal.m

@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
     GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,

@@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);

@@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             return false;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
             return true;

@@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute(
             {
                 GGML_ASSERT(ggml_is_contiguous(src0));

+                float scale;
+                memcpy(&scale, dst->op_params, sizeof(scale));

                 int64_t n = ggml_nelements(dst);

@@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute(
                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                 }

+                float scale;
+                float max_bias;
+
+                memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));

                 const int64_t nrows_x = ggml_nrows(src0);
                 const int64_t nrows_y = src0->ne[1];
+
                 const uint32_t n_head_kv = nrows_x/nrows_y;
                 const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

@@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute(
                 //const int n_past = ((int32_t *) dst->op_params)[0];
                 const int n_head = ((int32_t *) dst->op_params)[1];
+
                 float max_bias;
                 memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

@@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute(
                 [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+                float start;
+                float step;
+
+                memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&step,  ((int32_t *) dst->op_params) + 2, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_dst offset:offs_dst atIndex:0];
+                [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
+                [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int dim        = dst->op_params[0];
+                const int max_period = dst->op_params[1];
+
+                const int half = dim / 2;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&nb1        length:sizeof(nb1)        atIndex:2];
+                [encoder setBytes:&dim        length:sizeof(dim)        atIndex:3];
+                [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+
+                const int nth = MIN(1024, half);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(src0->type == GGML_TYPE_F32);
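GGML_OP_ARANGE is dispatched above as a single threadgroup of MIN(1024, ne0) threads, so the Metal kernel (shown in the next file) walks the output with a stride of the threadgroup size. The same access pattern in plain C, as an illustrative sketch only:

#include <assert.h>

// Equivalent of the strided loop in kernel_arange_f32: thread `tid` of a
// threadgroup of `ntg` threads writes elements tid, tid+ntg, tid+2*ntg, ...
static void arange_for_thread(float * dst, int ne0, float start, float step,
                              int tid, int ntg) {
    for (int i0 = tid; i0 < ne0; i0 += ntg) {
        dst[i0] = start + step * i0;
    }
}

int main(void) {
    enum { NE0 = 10, NTG = 4 };
    float dst[NE0];
    for (int tid = 0; tid < NTG; tid++) { // emulate the threadgroup serially
        arange_for_thread(dst, NE0, 0.0f, 0.5f, tid, NTG);
    }
    assert(dst[0] == 0.0f && dst[9] == 4.5f);
    return 0;
}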
ggml-metal.metal

@@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
     }
 }

+kernel void kernel_arange_f32(
+    device        char * dst,
+    constant   int64_t & ne0,
+    constant   float   & start,
+    constant   float   & step,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    device float * dst_ptr = (device float *) dst;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = start + step * i0;
+    }
+}
+
+kernel void kernel_timestep_embedding_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant  uint64_t & nb1,
+    constant  int      & dim,
+    constant  int      & max_period,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    int i = tgpig.x;
+    device float * embed_data = (device float *)(dst + i*nb1);
+
+    int half_ = dim / 2;
+    for (int j = tpitg.x; j < half_; j += ntg.x) {
+        float timestep = ((device float *)src0)[i];
+        float freq = (float)exp(-log((float)max_period) * j / half_);
+        float arg = timestep * freq;
+        embed_data[j        ] = cos(arg);
+        embed_data[j + half_] = sin(arg);
+    }
+
+    if (dim % 2 != 0 && tpitg.x == 0) {
+        embed_data[dim] = 0.f;
+    }
+}
+
 // bitonic sort implementation following the CUDA kernels as reference
 typedef void (argsort_t)(
         device const float * x,
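For reference, the formula that the CUDA, Metal and CPU kernels all implement (matching the CompVis helper referenced in ggml.h): with a timestep $t$, $\text{half} = \lfloor \text{dim}/2 \rfloor$ and $j = 0, \dots, \text{half}-1$,

$$
f_j = \exp\!\left(-\frac{\ln(\text{max\_period})\,j}{\text{half}}\right), \qquad
e[j] = \cos(t\, f_j), \qquad
e[j + \text{half}] = \sin(t\, f_j),
$$

with $e[\text{dim}] = 0$ appended when dim is odd (which is why the output row is padded to dim + 1 in that case).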
ggml.c

@@ -1822,6 +1822,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D",
     "UPSCALE",
     "PAD",
+    "ARANGE",
+    "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",

@@ -1850,7 +1852,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1908,6 +1910,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d(x)",
     "upscale(x)",
     "pad(x)",
+    "arange(start, stop, step)",
+    "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",

@@ -1936,7 +1940,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -2895,11 +2899,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
     return ((const int32_t *)(tensor->op_params))[i];
 }

+static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    return ((const float *)(tensor->op_params))[i];
+}
+
 static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
     assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
     ((int32_t *)(tensor->op_params))[i] = value;
 }

+static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    ((float *)(tensor->op_params))[i] = value;
+}
+
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
     memset(tensor->data, 0, ggml_nbytes(tensor));
     return tensor;

@@ -5898,6 +5912,55 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }

+struct ggml_tensor * ggml_arange(
+        struct ggml_context * ctx,
+        float start,
+        float stop,
+        float step) {
+
+    GGML_ASSERT(stop > start);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
+
+    result->op = GGML_OP_ARANGE;
+    ggml_set_op_params_f32(result, 0, start);
+    ggml_set_op_params_f32(result, 1, stop);
+    ggml_set_op_params_f32(result, 2, step);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_timestep_embedding(
+        struct ggml_context * ctx,
+        struct ggml_tensor * timesteps,
+        int dim,
+        int max_period) {
+    bool is_node = false;
+
+    if (timesteps->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    int actual_dim = dim;
+    if (dim % 2 != 0) {
+        actual_dim = dim + 1;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+
+    result->op = GGML_OP_TIMESTEP_EMBEDDING;
+    ggml_set_op_params_i32(result, 0, dim);
+    ggml_set_op_params_i32(result, 1, max_period);
+
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = timesteps;
+
+    return result;
+}
+
 // ggml_argsort

 struct ggml_tensor * ggml_argsort(

@@ -10231,7 +10294,7 @@ static void ggml_compute_forward_group_norm_f32(
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i += nth) {
         int start = i * n_channels_per_group;
         int end = start + n_channels_per_group;
         if (end > n_channels) {

@@ -10245,28 +10308,32 @@ static void ggml_compute_forward_group_norm_f32(
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sumr += (ggml_float)x[i00];
                 }
+                sum += sumr;
             }
         }
+        const float mean = sum / (ne00 * ne01 * step);

+        ggml_float sum2 = 0.0;
         for (int64_t i02 = start; i02 < end; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

                 float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);

+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
+                    sumr += (ggml_float)(v * v);
                 }
+                sum2 += sumr;
             }
         }
+        const float variance = sum2 / (ne00 * ne01 * step);
         const float scale = 1.0f / sqrtf(variance + eps);

         for (int64_t i02 = start; i02 < end; i02++) {

@@ -13547,6 +13614,106 @@ static void ggml_compute_forward_pad(
     }
 }

+
+// ggml_compute_forward_arange
+
+static void ggml_compute_forward_arange_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float start = ggml_get_op_params_f32(dst, 0);
+    const float stop  = ggml_get_op_params_f32(dst, 1);
+    const float step  = ggml_get_op_params_f32(dst, 2);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    for (int64_t i = ith; i < steps; i+= nth) {
+        float value = start + step * i;
+        ((float *)dst->data)[i] = value;
+    }
+}
+
+static void ggml_compute_forward_arange(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_arange_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int dim = ggml_get_op_params_i32(dst, 0);
+    const int max_period = ggml_get_op_params_i32(dst, 1);
+
+    int half = dim / 2;
+
+    for (int64_t i = 0; i < ne00; i++) {
+        float * embed_data = (float *)((char *) dst->data + i*nb1);
+        for (int64_t j = ith; j < half; j += nth) {
+            float timestep = ((float *)src0->data)[i];
+            float freq = (float)expf(-logf(max_period) * j / half);
+            float arg = timestep * freq;
+            embed_data[j] = cosf(arg);
+            embed_data[j + half] = sinf(arg);
+        }
+        if (dim % 2 != 0 && ith == 0) {
+            embed_data[dim] = 0.f;
+        }
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_timestep_embedding_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_argsort

 static void ggml_compute_forward_argsort_f32(

@@ -15615,6 +15782,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad(params, tensor);
            } break;
+        case GGML_OP_ARANGE:
+            {
+                ggml_compute_forward_arange(params, tensor);
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                ggml_compute_forward_timestep_embedding(params, tensor);
+            } break;
         case GGML_OP_ARGSORT:
            {
                ggml_compute_forward_argsort(params, tensor);

@@ -16617,6 +16792,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ARGSORT:
            {
                GGML_ASSERT(false); // TODO: not implemented

@@ -17368,6 +17551,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = n_threads;
            } break;
+        case GGML_OP_ARANGE:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_ARGSORT:
            {
                n_tasks = n_threads;
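A quick sanity check of the element-count rule used both by ggml_arange and by the forward pass above, steps = ceil((stop - start) / step), written as a standalone C snippet (the particular start/stop/step values are just an example; compile with -lm):

#include <assert.h>
#include <math.h>

int main(void) {
    // ggml_arange(ctx, 0.0f, 5.0f, 1.5f) allocates ceilf(5.0/1.5) = 4 elements,
    // and the forward pass fills them with 0.0, 1.5, 3.0, 4.5.
    const float start = 0.0f, stop = 5.0f, step = 1.5f;
    const long long steps = (long long) ceilf((stop - start) / step);
    assert(steps == 4);
    for (long long i = 0; i < steps; i++) {
        float value = start + step * i; // same formula as ggml_compute_forward_arange_f32
        assert(value < stop);
    }
    return 0;
}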
ggml.h

@@ -454,6 +454,8 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_ARANGE,
+        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,

@@ -1661,6 +1663,15 @@ extern "C" {
             int                   p2,
             int                   p3);

+    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+    // timesteps: [N,]
+    // return: [N, dim]
+    GGML_API struct ggml_tensor * ggml_timestep_embedding(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * timesteps,
+            int                   dim,
+            int                   max_period);
+
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,

@@ -1672,6 +1683,12 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);

+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,