Spaces:
Running
Running
Neo Zhang
committed on
rm wait() (llama/7233)
Browse files- ggml-sycl.cpp +1 -24
ggml-sycl.cpp
CHANGED
|
@@ -15564,26 +15564,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
| 15564 |
const int64_t r2 = ne12/ne02;
|
| 15565 |
const int64_t r3 = ne13/ne03;
|
| 15566 |
|
| 15567 |
-
#if 0
|
| 15568 |
-
// use syclGemmEx
|
| 15569 |
-
{
|
| 15570 |
-
for (int i13 = 0; i13 < ne13; ++i13) {
|
| 15571 |
-
for (int i12 = 0; i12 < ne12; ++i12) {
|
| 15572 |
-
int i03 = i13 / r3;
|
| 15573 |
-
int i02 = i12 / r2;
|
| 15574 |
-
|
| 15575 |
-
SYCL_CHECK(
|
| 15576 |
-
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
| 15577 |
-
ne01, ne11, ne10,
|
| 15578 |
-
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
|
| 15579 |
-
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
|
| 15580 |
-
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
| 15581 |
-
cu_compute_type,
|
| 15582 |
-
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
| 15583 |
-
}
|
| 15584 |
-
}
|
| 15585 |
-
}
|
| 15586 |
-
#else
|
| 15587 |
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
| 15588 |
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
| 15589 |
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
@@ -15595,7 +15575,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
| 15595 |
nb11 / nb10, nb12 / nb10, beta,
|
| 15596 |
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
| 15597 |
ne12 * ne13, cu_compute_type)));
|
| 15598 |
-
g_sycl_handles[g_main_device]->wait();
|
| 15599 |
} else {
|
| 15600 |
const int ne23 = ne12*ne13;
|
| 15601 |
|
|
@@ -15626,7 +15605,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
| 15626 |
nb02, nb03, nb12_scaled, nb13_scaled,
|
| 15627 |
nbd2, nbd3, r2, r3, item_ct1);
|
| 15628 |
});
|
| 15629 |
-
})
|
| 15630 |
}
|
| 15631 |
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
| 15632 |
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
|
@@ -15637,9 +15616,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
| 15637 |
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
| 15638 |
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
| 15639 |
cu_compute_type)));
|
| 15640 |
-
g_sycl_handles[g_main_device]->wait();
|
| 15641 |
}
|
| 15642 |
-
#endif
|
| 15643 |
|
| 15644 |
if (no_mixed_dtypes) {
|
| 15645 |
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
|
|
|
| 15564 |
const int64_t r2 = ne12/ne02;
|
| 15565 |
const int64_t r3 = ne13/ne03;
|
| 15566 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15567 |
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
| 15568 |
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
| 15569 |
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
|
|
| 15575 |
nb11 / nb10, nb12 / nb10, beta,
|
| 15576 |
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
| 15577 |
ne12 * ne13, cu_compute_type)));
|
|
|
|
| 15578 |
} else {
|
| 15579 |
const int ne23 = ne12*ne13;
|
| 15580 |
|
|
|
|
| 15605 |
nb02, nb03, nb12_scaled, nb13_scaled,
|
| 15606 |
nbd2, nbd3, r2, r3, item_ct1);
|
| 15607 |
});
|
| 15608 |
+
});
|
| 15609 |
}
|
| 15610 |
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
| 15611 |
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
|
|
|
| 15616 |
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
| 15617 |
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
| 15618 |
cu_compute_type)));
|
|
|
|
| 15619 |
}
|
|
|
|
| 15620 |
|
| 15621 |
if (no_mixed_dtypes) {
|
| 15622 |
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|