Neo Zhang Jianyu committed
Commit 6d1ba81 · 1 Parent(s): a06cbc7

fix mul_mat_id() for new input, make the ut pass (llama/6682)

Files changed (1): ggml-sycl.cpp (+50 −46)
ggml-sycl.cpp CHANGED
@@ -15996,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-#if 0
-    ggml_sycl_mul_mat_id_sycl(dst);
-    // TODO: mmq/mmv support
-#endif
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+                "mul_mat_id does not support split buffers");
+    const ggml_tensor *ids = dst->src[2];
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
 
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  = dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];
 
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+    const int32_t id = ((int32_t *)dst->op_params)[0];
+    const int32_t n_as = src0->ne[2];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
+    const char *ids_dev = (const char *)ids->data;
 
-    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
-
-    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
-        // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
 
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+    const ggml_tensor_extra_gpu *src0_extra =
+        (const ggml_tensor_extra_gpu *)src0->extra;
+    const ggml_tensor_extra_gpu *src1_extra =
+        (const ggml_tensor_extra_gpu *)src1->extra;
+    const ggml_tensor_extra_gpu *dst_extra =
+        (const ggml_tensor_extra_gpu *)dst->extra;
 
+    ggml_tensor_extra_gpu src0_row_extra;
     ggml_tensor_extra_gpu src1_row_extra;
     ggml_tensor_extra_gpu dst_row_extra;
 
+    ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
     src1_row.backend = GGML_BACKEND_TYPE_GPU;
     dst_row.backend = GGML_BACKEND_TYPE_GPU;
 
+    src0_row.extra = &src0_row_extra;
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src0->data
+                              : (char *)src0_extra->data_device[g_main_device];
+    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src1->data
+                              : (char *)src1_extra->data_device[g_main_device];
+    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+                             ? (char *)dst->data
+                             : (char *)dst_extra->data_device[g_main_device];
 
-    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];
 
+    if (src1->ne[1] == 1) {
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
-            //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            const int32_t row_id =
+                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+                                   id * ids->nb[0]);
 
             GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+            src1_row_extra.data_device[g_main_device] =
+                src1_original + i01 * src1->nb[1];
+            dst_row_extra.data_device[g_main_device] =
+                dst_original + i01 * dst->nb[1];
 
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
         }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16072,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
             int64_t num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16086,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11).wait()));
+                                   src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
             }
 
@@ -16094,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                 continue;
             }
 
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+
             src1_row.ne[1] = num_src1_rows;
             dst_row.ne[1] = num_src1_rows;
 
@@ -16105,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
 
             num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16119,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1).wait()));
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
             }
         }
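
The change switches mul_mat_id() to the new input convention: the expert-selection tensor ids is read from dst->src[2], the number of experts n_as comes from src0->ne[2], and an expert's weights are addressed as src0_original + row_id * src0->nb[2] inside a single stacked src0, instead of going through a separate dst->src[row_id + 2] tensor. The sketch below is a plain host-side illustration of that addressing, with made-up sizes and values (n_tokens, n_ids, the expert matrices); it is not the SYCL implementation.

// Minimal host-side sketch (not the SYCL kernel) of the indexing used by the
// new code path: experts stacked along dim 2 of src0, one int32 expert index
// per token read from the ids tensor, expert slab found at base + row_id*nb[2].
// All sizes and values are invented for the example; only the pointer
// arithmetic mirrors the diff.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n_as     = 4; // number of experts, src0->ne[2] in the new layout
    const int ne00     = 3; // columns of one expert matrix
    const int ne01     = 2; // rows of one expert matrix
    const int n_ids    = 2; // expert slots per token, ids->ne[0]
    const int n_tokens = 3; // src1->ne[1]
    const int id       = 1; // dst->op_params[0]: which ids slot to use

    // src0: all experts stored contiguously; expert e starts at e * nb02 bytes
    std::vector<float> src0((size_t) n_as * ne01 * ne00);
    for (size_t i = 0; i < src0.size(); ++i) src0[i] = (float) i;
    const size_t nb02 = (size_t) ne01 * ne00 * sizeof(float); // src0->nb[2]

    // ids: n_ids int32 expert indices per token
    const int32_t ids[n_tokens][n_ids] = {{0, 2}, {1, 0}, {3, 3}};

    // src1: one input row per token (all ones, so the result is easy to read)
    std::vector<float> src1((size_t) n_tokens * ne00, 1.0f);

    for (int i01 = 0; i01 < n_tokens; ++i01) {
        // same lookup as ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]
        const int32_t row_id = ids[i01][id];
        if (row_id < 0 || row_id >= n_as) return 1; // GGML_ASSERT in the diff

        // same pointer arithmetic as src0_original + row_id * src0->nb[2]
        const float * expert = (const float *) ((const char *) src0.data() + row_id * nb02);
        const float * x      = src1.data() + (size_t) i01 * ne00;

        for (int r = 0; r < ne01; ++r) {
            float acc = 0.0f;
            for (int c = 0; c < ne00; ++c) {
                acc += expert[(size_t) r * ne00 + c] * x[c];
            }
            printf("token %d -> expert %d, dst row %d = %.1f\n", i01, (int) row_id, r, acc);
        }
    }
    return 0;
}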