Spaces:
Running
Running
CUDA: faster dequantize kernels for Q4_0 and Q4_1 (llama/4938)
Browse files- ggml-cuda.cu +73 -4
ggml-cuda.cu
CHANGED
|
@@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
|
| 1105 |
#endif // GGML_CUDA_F16
|
| 1106 |
}
|
| 1107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
//================================== k-quants
|
| 1109 |
|
| 1110 |
template<typename dst_t>
|
|
@@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
| 6253 |
#endif
|
| 6254 |
}
|
| 6255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6256 |
template<typename dst_t>
|
| 6257 |
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6258 |
const int nb = k / QK_K;
|
|
@@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
| 6301 |
int id;
|
| 6302 |
switch (type) {
|
| 6303 |
case GGML_TYPE_Q4_0:
|
| 6304 |
-
return
|
| 6305 |
case GGML_TYPE_Q4_1:
|
| 6306 |
-
return
|
| 6307 |
case GGML_TYPE_Q5_0:
|
| 6308 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6309 |
case GGML_TYPE_Q5_1:
|
|
@@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
| 6338 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 6339 |
switch (type) {
|
| 6340 |
case GGML_TYPE_Q4_0:
|
| 6341 |
-
return
|
| 6342 |
case GGML_TYPE_Q4_1:
|
| 6343 |
-
return
|
| 6344 |
case GGML_TYPE_Q5_0:
|
| 6345 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6346 |
case GGML_TYPE_Q5_1:
|
|
|
|
| 1105 |
#endif // GGML_CUDA_F16
|
| 1106 |
}
|
| 1107 |
|
| 1108 |
+
template<typename dst_t>
|
| 1109 |
+
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 1110 |
+
|
| 1111 |
+
const int i = blockIdx.x;
|
| 1112 |
+
|
| 1113 |
+
// assume 32 threads
|
| 1114 |
+
const int tid = threadIdx.x;
|
| 1115 |
+
const int il = tid/8;
|
| 1116 |
+
const int ir = tid%8;
|
| 1117 |
+
const int ib = 8*i + ir;
|
| 1118 |
+
if (ib >= nb32) {
|
| 1119 |
+
return;
|
| 1120 |
+
}
|
| 1121 |
+
|
| 1122 |
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
| 1123 |
+
|
| 1124 |
+
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
| 1125 |
+
const float d = __half2float(x->d);
|
| 1126 |
+
const float dm = -8*d;
|
| 1127 |
+
|
| 1128 |
+
const uint8_t * q = x->qs + 4*il;
|
| 1129 |
+
|
| 1130 |
+
for (int l = 0; l < 4; ++l) {
|
| 1131 |
+
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
| 1132 |
+
y[l+16] = d * (q[l] >> 4) + dm;
|
| 1133 |
+
}
|
| 1134 |
+
}
|
| 1135 |
+
|
| 1136 |
+
template<typename dst_t>
|
| 1137 |
+
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
| 1138 |
+
|
| 1139 |
+
const int i = blockIdx.x;
|
| 1140 |
+
|
| 1141 |
+
// assume 32 threads
|
| 1142 |
+
const int tid = threadIdx.x;
|
| 1143 |
+
const int il = tid/8;
|
| 1144 |
+
const int ir = tid%8;
|
| 1145 |
+
const int ib = 8*i + ir;
|
| 1146 |
+
if (ib >= nb32) {
|
| 1147 |
+
return;
|
| 1148 |
+
}
|
| 1149 |
+
|
| 1150 |
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
| 1151 |
+
|
| 1152 |
+
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
| 1153 |
+
const float2 d = __half22float2(x->dm);
|
| 1154 |
+
|
| 1155 |
+
const uint8_t * q = x->qs + 4*il;
|
| 1156 |
+
|
| 1157 |
+
for (int l = 0; l < 4; ++l) {
|
| 1158 |
+
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
| 1159 |
+
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
| 1160 |
+
}
|
| 1161 |
+
}
|
| 1162 |
+
|
| 1163 |
//================================== k-quants
|
| 1164 |
|
| 1165 |
template<typename dst_t>
|
|
|
|
| 6308 |
#endif
|
| 6309 |
}
|
| 6310 |
|
| 6311 |
+
template<typename dst_t>
|
| 6312 |
+
static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6313 |
+
const int nb32 = k / 32;
|
| 6314 |
+
const int nb = (k + 255) / 256;
|
| 6315 |
+
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 6316 |
+
}
|
| 6317 |
+
|
| 6318 |
+
template<typename dst_t>
|
| 6319 |
+
static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6320 |
+
const int nb32 = k / 32;
|
| 6321 |
+
const int nb = (k + 255) / 256;
|
| 6322 |
+
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
| 6323 |
+
}
|
| 6324 |
+
|
| 6325 |
template<typename dst_t>
|
| 6326 |
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
| 6327 |
const int nb = k / QK_K;
|
|
|
|
| 6370 |
int id;
|
| 6371 |
switch (type) {
|
| 6372 |
case GGML_TYPE_Q4_0:
|
| 6373 |
+
return dequantize_q4_0_cuda;
|
| 6374 |
case GGML_TYPE_Q4_1:
|
| 6375 |
+
return dequantize_q4_1_cuda;
|
| 6376 |
case GGML_TYPE_Q5_0:
|
| 6377 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6378 |
case GGML_TYPE_Q5_1:
|
|
|
|
| 6407 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 6408 |
switch (type) {
|
| 6409 |
case GGML_TYPE_Q4_0:
|
| 6410 |
+
return dequantize_q4_0_cuda;
|
| 6411 |
case GGML_TYPE_Q4_1:
|
| 6412 |
+
return dequantize_q4_1_cuda;
|
| 6413 |
case GGML_TYPE_Q5_0:
|
| 6414 |
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
| 6415 |
case GGML_TYPE_Q5_1:
|