JohannesGaessler committed

Commit e96ba7d (unverified) · Parent: 7df090b

CUDA: more info when no device code (llama/5088)

Files changed (1):
  ggml-cuda.cu  +54 -35
ggml-cuda.cu CHANGED
@@ -13,6 +13,10 @@
 #include <map>
 #include <array>
 
+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
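
The two levels of indirection matter here: the preprocessor's # operator does not expand its argument, so STRINGIZE first expands __CUDA_ARCH_LIST__ (a comma-separated list of integers, hence the variadic form) and only then hands the result to STRINGIZE_IMPL for stringization. A minimal sketch, not part of the patch, with illustrative architecture values:

    // Assume nvcc was invoked for compute capabilities 6.1 and 7.0, so that
    // __CUDA_ARCH_LIST__ expands to 610,700 (illustrative values only):
    //
    //   STRINGIZE(__CUDA_ARCH_LIST__)       ->  "610,700"              (expanded, then stringized)
    //   STRINGIZE_IMPL(__CUDA_ARCH_LIST__)  ->  "__CUDA_ARCH_LIST__"   (stringized without expansion)
    //
    // The expanded-and-stringized form is what no_device_code() receives below.
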
@@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
 [[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
-    (void) bad_arch; // suppress unused function warning
+    (void) no_device_code; // suppress unused function warning
 }
 
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
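
To make the many call-site changes below easier to read: in device code NO_DEVICE_CODE carries the failing location and architecture information into no_device_code(), while in host code it is deliberately a hard assertion. A rough sketch of the expansion, not part of the patch, with placeholders instead of real values:

    // Device compilation pass (__CUDA_ARCH__ defined), e.g. in a kernel's fallback branch:
    //   NO_DEVICE_CODE;
    //   -> no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__));
    //   -> no_device_code("ggml-cuda.cu", <line>, "<function>", <arch>, "<arch list>");
    //      prints the error message defined above, then calls __trap()
    //
    // Host compilation pass (__CUDA_ARCH__ not defined):
    //   NO_DEVICE_CODE;
    //   -> GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.");
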
@@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
     return a;
 #else
     (void) a;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
     return x;
 #else
     (void) x;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 
@@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     }
 #else
     (void) vx; (void) y; (void) k;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4704,7 +4723,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4838,7 +4857,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5043,7 +5062,7 @@ mul_mat_q5_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
     }
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
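
On a GPU for which no compatible device code was built, a failing kernel now reports where it stopped and what ggml-cuda.cu was actually compiled for, instead of the previous generic "ERROR: ggml-cuda was compiled without support for the current GPU architecture." message. The CUDA-side message has the shape below; the placeholders are filled from __FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__ and __CUDA_ARCH_LIST__ at the failing call site, and the HIP variant omits the arch list:

    ggml-cuda.cu:<line>: ERROR: CUDA kernel <function> has no device code compatible with CUDA arch <arch>. ggml-cuda.cu was compiled for: <arch list>

The kernel then traps, as before.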