R0CKSTAR commited on
Commit
0fd6120
·
1 Parent(s): 9fb68a1

cuda : remove nrows_x in mul_mat_q_process_tile (llama/13325)

Browse files
Files changed (1) hide show
  1. ggml/src/ggml-cuda/mmq.cuh +4 -4
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
2522
  static __device__ __forceinline__ void mul_mat_q_process_tile(
2523
  const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
2524
  const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
2525
- const int nrows_x, const int stride_row_x, const int ncols_y, const int stride_col_dst,
2526
  const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
2527
 
2528
  constexpr int qk = ggml_cuda_type_traits<type>::qk;
@@ -2689,7 +2689,7 @@ static __global__ void mul_mat_q(
2689
 
2690
  constexpr bool fixup = false;
2691
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2692
- (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
2693
  tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
2694
  return;
2695
  }
@@ -2767,7 +2767,7 @@ static __global__ void mul_mat_q(
2767
 
2768
  constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
2769
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2770
- (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
2771
  tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
2772
 
2773
  kbc += blocks_per_ne00;
@@ -2834,7 +2834,7 @@ static __global__ void mul_mat_q(
2834
 
2835
  constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
2836
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2837
- (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
2838
  tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
2839
  }
2840
 
 
2522
  static __device__ __forceinline__ void mul_mat_q_process_tile(
2523
  const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
2524
  const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
2525
+ const int stride_row_x, const int ncols_y, const int stride_col_dst,
2526
  const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
2527
 
2528
  constexpr int qk = ggml_cuda_type_traits<type>::qk;
 
2689
 
2690
  constexpr bool fixup = false;
2691
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2692
+ (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
2693
  tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
2694
  return;
2695
  }
 
2767
 
2768
  constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
2769
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2770
+ (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
2771
  tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
2772
 
2773
  kbc += blocks_per_ne00;
 
2834
 
2835
  constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
2836
  mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
2837
+ (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
2838
  tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
2839
  }
2840