CUDA: faster q8_0 -> f16 dequantization (llama/4895)
Changed files: ggml-cuda.cu (+57, -0)

@@ -523,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 
+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
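This hunk adds the tile size for the new fast path: each CUDA block converts CUDA_Q8_0_NE_ALIGN = 2048 q8_0 values. With QK8_0 = 32 values per quantization block, one tile spans 64 block_q8_0 structs of 34 bytes each (a half scale plus 32 int8 quants), i.e. 2176 bytes = 544 ints; that equals the CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE ints the kernel below stages in shared memory, the extra WARP_SIZE ints being exactly the footprint of the 64 two-byte scales. A minimal layout sketch, restating ggml's standard q8_0 definitions for reference (not part of the commit):

#include <cuda_fp16.h>

// Sketch of ggml's q8_0 block layout, assuming the standard ggml definitions
// (QK8_0 == 32, scale stored as IEEE fp16).
#define QK8_0     32
#define WARP_SIZE 32

typedef struct {
    half   d;          // per-block scale factor
    int8_t qs[QK8_0];  // 32 quantized values; dequantized as y = d * qs[i]
} block_q8_0;

// Arithmetic the fast kernel relies on: 2048 values = 64 blocks = 2176 bytes,
// which is 2048/sizeof(int) + WARP_SIZE == 544 ints (the 64 half scales
// contribute 128 bytes == WARP_SIZE ints).
static_assert(sizeof(block_q8_0) == sizeof(half) + QK8_0, "q8_0 block is 34 bytes");
static_assert((2048/QK8_0)*sizeof(block_q8_0) == 2048 + (2048/QK8_0)*sizeof(half),
              "one 2048-value tile is exactly 64 blocks of scale + quants");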
@@ -2327,6 +2329,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
     y[i] = x[i];
 }
 
+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+#if __CUDA_ARCH__ >= CC_PASCAL
+    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int * x0 = ((int *) vx) + blockIdx.x * nint;
+    half2 * y2 = (half2 *) (y + i0);
+
+    __shared__ int vals[nint];
+
+#pragma unroll
+    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+            break;
+        }
+
+        const int ix = ix0 + threadIdx.x;
+        vals[ix] = x0[ix];
+    }
+
+#pragma unroll
+    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+            return;
+        }
+
+        const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+        const half    d = *b0;
+        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+    }
+#else
+    (void) vx; (void) y; (void) k;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_PASCAL
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
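The kernel runs in two phases. First, the 32 threads of the block copy the tile's raw bytes into shared memory as coalesced 32-bit loads, scales and quants together, hence the nint sizing above. Second, each thread reinterprets the staged bytes, reading one half scale and a char2 pair of quants per iteration, and writes two f16 outputs at once with __hmul2; only the need_check = true instantiation performs bounds tests, and below CC_PASCAL the kernel compiles to bad_arch() since it relies on fp16 arithmetic. For contrast, a naive one-thread-per-element conversion might look like the sketch below; it is illustrative only, not part of the commit, and assumes the block_q8_0 layout sketched above:

#include <cuda_fp16.h>

// Naive reference: one thread per output element, no shared-memory staging,
// one fp16 multiply per value. Illustrative baseline, not part of the commit.
static __global__ void dequantize_q8_0_f16_ref(const block_q8_0 * __restrict__ x,
                                               half * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const block_q8_0 & b = x[i / QK8_0];                 // the 32-value block holding element i
    y[i] = __hmul(b.d, __int2half_rn(b.qs[i % QK8_0]));  // y = d * q
}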
@@ -6181,6 +6222,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
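The launcher rounds the element count up to whole 2048-value tiles and runs one warp (WARP_SIZE = 32 threads) per tile; because need_check is a compile-time template parameter, the fully aligned case carries no bounds checks at all. A small worked example of the launch arithmetic, with an illustrative tensor shape:

#include <cstdio>

#define CUDA_Q8_0_NE_ALIGN 2048  // mirrors the constant added above

int main() {
    // For a hypothetical 4096 x 4096 tensor: k = 16,777,216 elements.
    const int k          = 4096 * 4096;
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    // k is a multiple of CUDA_Q8_0_NE_ALIGN here, so the need_check == false
    // instantiation is launched and no per-iteration bounds tests run.
    printf("%d blocks of one warp (32 threads) each\n", num_blocks);  // prints 8192
    return 0;
}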
@@ -6246,6 +6298,7 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
 }
 
 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
             return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
@@ -6256,6 +6309,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
+            CUDA_CHECK(cudaGetDevice(&id));
+            if (g_device_caps[id].cc >= CC_PASCAL) {
+                return dequantize_block_q8_0_f16_cuda;
+            }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
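Finally, the dispatch: id is declared before the switch because a declaration cannot directly follow a case label without its own scope, and the Q8_0 case returns the shared-memory kernel only on devices of Pascal class or newer, keeping the generic dequantize_block path as the fallback. A hedged sketch of how a caller might use the returned converter; this is a fragment, and the buffer and stream names (src_ddq, src_as_f16, ne, main_stream) are illustrative, not taken from this diff:

// Hypothetical call site for the converter returned by ggml_get_to_fp16_cuda.
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(GGML_TYPE_Q8_0);
GGML_ASSERT(to_fp16_cuda != nullptr);
to_fp16_cuda(src_ddq, src_as_f16, ne, main_stream);  // q8_0 -> f16 into a staging buffer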