CUDA: more info when no device code (llama/5088)
ggml-cuda.cu  CHANGED  (+54 -35)
@@ -13,6 +13,10 @@
 #include <map>
 #include <array>
 
+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
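The two macros need the extra level of indirection so that __CUDA_ARCH_LIST__ is expanded before '#' turns it into a string literal; stringizing the argument directly would yield the macro's name rather than the list of architectures. A minimal sketch of the idiom, using a hypothetical ARCH_LIST macro in place of the nvcc-provided __CUDA_ARCH_LIST__:

    // Two-level stringize: STRINGIZE expands its argument first, then
    // STRINGIZE_IMPL applies '#'. ARCH_LIST is a stand-in for the
    // nvcc-provided __CUDA_ARCH_LIST__.
    #include <cstdio>

    #define STRINGIZE_IMPL(...) #__VA_ARGS__
    #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

    #define ARCH_LIST 610,700,860

    int main() {
        printf("%s\n", STRINGIZE(ARCH_LIST));      // prints: 610,700,860
        printf("%s\n", STRINGIZE_IMPL(ARCH_LIST)); // prints: ARCH_LIST (argument not expanded)
        return 0;
    }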
@@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
 [[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
+    (void) no_device_code; // suppress unused function warning
 }
 
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
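Every architecture-guarded fallback branch in the hunks below now ends with NO_DEVICE_CODE, so a kernel whose real body was compiled out reports the file, line, kernel name, the __CUDA_ARCH__ it was compiled for, and the full architecture list before trapping; in host code the macro degrades to a GGML_ASSERT. A condensed sketch of the pattern, assuming the definitions from the hunk above (the kernel and the MIN_ARCH threshold are illustrative, not taken from ggml-cuda.cu):

    // Illustrative kernel using the NO_DEVICE_CODE fallback pattern; MIN_ARCH
    // stands in for thresholds such as CC_PASCAL or CC_VOLTA used in ggml-cuda.cu.
    #define MIN_ARCH 700

    static __global__ void example_square(const float * x, float * dst, const int n) {
    #if __CUDA_ARCH__ >= MIN_ARCH
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = x[i]*x[i]; // real device code, only compiled for new enough archs
        }
    #else
        (void) x; (void) dst; (void) n; // silence unused-parameter warnings
        NO_DEVICE_CODE;                 // print file/line/kernel/arch info, then __trap()
    #endif // __CUDA_ARCH__ >= MIN_ARCH
    }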
@@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
     return a;
 #else
     (void) a;
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
     return x;
 #else
     (void) x;
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 
@@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     }
 #else
     (void) vx; (void) y; (void) k;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4704,7 +4723,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_1_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4838,7 +4857,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5043,7 +5062,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
     }
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
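For illustration only, the device-side expansion of the macro at the mul_mat_q5_K fallback above would look roughly as follows; the line number follows the post-change file, while the architecture values are made up:

    // Schematic expansion of NO_DEVICE_CODE at ggml-cuda.cu:5065 (mul_mat_q5_K);
    // the arch value 520 and the list "520" are illustrative, not from a real build.
    no_device_code("ggml-cuda.cu", 5065, __FUNCTION__, 520, "520");

    // which would print something like:
    // ggml-cuda.cu:5065: ERROR: CUDA kernel mul_mat_q5_K has no device code compatible with CUDA arch 520. ggml-cuda.cu was compiled for: 520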