sync : ggml (CUDA faster rope)
ggml-cuda.cu (+14 -18)
@@ -4086,7 +4086,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols,
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+                                    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4098,8 +4099,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols,
     const int i = row*ncols + col;
 
     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);
 
-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4109,7 +4111,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols,
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
 
-    const float block_theta = block_p*col_theta_scale;
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
 
@@ -4984,12 +4986,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 4 == 0);
-    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                              const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5723,22 +5726,18 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
-        const float p = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
-        const float id_p = min(p, n_ctx - 2.f);
-        const float block_p = max(p - (n_ctx - 2.f), 0.f);
-        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
-        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }
 
@@ -6400,10 +6399,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
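What the change does: rope_glm_f32 used to receive two precomputed position values (p and block_p) shared by every row in the launch; it now receives p0/p_delta and derives the position per row, applying the same clamp at n_ctx - 2 that the host previously applied once via id_p/block_p. A minimal host-side sketch of that per-row split, with demo values only (this is not code from the commit):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    // demo values; in ggml_cuda_op_rope, p0 = n_past*freq_scale and p_delta = freq_scale
    const int   n_ctx       = 8;
    const float p0          = 0.0f;
    const float p_delta     = 1.0f;
    const float theta_scale = 0.5f;  // powf(freq_base, -2.0f/n_dims)
    const int   col         = 0;     // first rotated pair

    const float col_theta_scale = std::pow(theta_scale, (float) col);

    for (int row = 0; row < 10; ++row) {
        // per-row position, as in the kernel (p_delta_rows = 1 here for simplicity)
        const float p = p0 + p_delta*row;

        // positions saturate at n_ctx - 2; the overflow drives the "block" rotation
        const float theta       = std::min(p, p_delta*(n_ctx - 2))*col_theta_scale;
        const float block_theta = std::max(p - p_delta*(n_ctx - 2), 0.0f)*col_theta_scale;

        std::printf("row %2d: theta = %4.1f  block_theta = %4.1f\n", row, theta, block_theta);
    }
    return 0;
}

Moving the split into the kernel means rows at different positions can share one launch, which is also what lets ggml_cuda_rope enable flattening (the final true in the ggml_cuda_op call) where the removed comment noted it was unsupported for GLM.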
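The launch-geometry change in rope_glm_f32_cuda is the likely source of the "faster rope" in the commit title: each kernel thread rotates two element pairs (four values), so only ncols/4 threads per row are needed, and the new block_dims of CUDA_ROPE_BLOCK_SIZE/4 with a ceil-divide by CUDA_ROPE_BLOCK_SIZE launches exactly that many. A quick arithmetic check, assuming CUDA_ROPE_BLOCK_SIZE is 256 as defined in ggml-cuda.cu (the define is an assumption here, not shown in the diff):

#include <cstdio>
#include <initializer_list>

#define CUDA_ROPE_BLOCK_SIZE 256 // assumed to match the #define in ggml-cuda.cu

int main() {
    for (int ncols : {256, 1024, 4096}) { // ncols % 4 == 0, per the new GGML_ASSERT
        const int block_dim_x  = CUDA_ROPE_BLOCK_SIZE/4;
        const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
        // one thread per processed column: the kernel handles ncols/4 columns per row
        std::printf("ncols %4d: %d blocks x %d threads = %4d (kernel needs ncols/4 = %4d)\n",
                    ncols, num_blocks_x, block_dim_x, num_blocks_x*block_dim_x, ncols/4);
    }
    return 0;
}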