Committed by Abhilash Majumder, jianyuzh, KevinLy, hengyu, ggerganov
Commit 01169e0 · unverified · 1 parent: 1bbb1a9

ggml : add unified SYCL backend for Intel GPUs (llama/2690)

* first update for migration

* update init_cublas

* add debug function, commit all helper code

* step 1

* step 2

* step 3: add fp16, slower (31 -> 28)

* add GGML_LIST_DEVICE function

* step 5: format device and print

* step 6: enhance error check, remove CUDA macro, enhance device id to fix non-zero id issue

* support main device is non-zero

* step 7: add debug for code path, rm log

* step 8: rename all macros & funcs from cuda to sycl

* fix error when selecting non-zero device, format device list

* ren ggml-sycl.hpp -> ggml-sycl.h

* clear CMAKE to rm unused lib and options

* correct queue: rm dpct::get_queue

* add print tensor function to debug

* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481

* consolidate dpct definitions in one header file to replace folder dpct

* refactor device log

* mv dpct definition from folder dpct to ggml-sycl.h

* update readme, refactor build script

* fix build with sycl

* set nthread=1 when using sycl, increase performance

* add run script, comment debug code

* add ls-sycl-device tool

* add ls-sycl-device, rm unused files
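
  A minimal sketch of what such a device-listing tool does, using only the standard SYCL 2020 runtime API to enumerate platforms and devices (the actual ls-sycl-device tool prints more fields; this is illustrative, not the commit's code):

  #include <sycl/sycl.hpp>
  #include <iostream>

  int main() {
      // walk every SYCL platform visible to the runtime
      for (const auto & platform : sycl::platform::get_platforms()) {
          std::cout << platform.get_info<sycl::info::platform::name>() << "\n";
          // list each device exposed by this platform
          for (const auto & device : platform.get_devices()) {
              std::cout << "  " << device.get_info<sycl::info::device::name>()
                        << " (compute units: "
                        << device.get_info<sycl::info::device::max_compute_units>()
                        << ")\n";
          }
      }
      return 0;
  }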

* rm trailing spaces

* dos2unix

* Update README_sycl.md

* fix return type

* remove sycl version from include path

* restore removed code to fix hang issue

* add sycl and link for sycl readme

* rm original sycl code before refactor

* fix code err

* add known issue for PVC hang issue

* enable SYCL_F16 support
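
  A minimal sketch of the usual fp16 gating pattern behind this flag, assuming a GGML_SYCL_F16 compile definition (mirroring the GGML_CUDA_F16 / dfloat pattern in the CUDA backend; illustrative, not the backend's actual code):

  #include <sycl/sycl.hpp>

  #ifdef GGML_SYCL_F16
  typedef sycl::half dfloat; // keep operands in fp16 to halve memory bandwidth
  #else
  typedef float      dfloat; // fall back to fp32 on builds without F16 support
  #endif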

* align with pr4766

* check for sycl blas, better performance

* cleanup 1

* remove extra endif

* add build & run script, clean CMake file, update guide per review comments

* rename macro to intel hardware

* editor config format

* format fixes

* format fixes

* editor format fix

* Remove unused headers

* skip building sycl tool for other code paths

* replace tab by space

* fix blas matmul function

* fix mac build

* restore hip dependency

* fix conflict

* rename per review comments

* mv internal function to .cpp file

* export function print_sycl_devices(), mv dpct class definition to source file

* update CI/action for sycl code, fix CI error of repeat/dup

* fix action ID format issue

* rm unused strategy

* enable llama_f16 in ci

* fix conflict

* fix build break on macOS: its CI depends on external ggml instead of internal ggml

* fix ci cases for unsupported data type

* revert unrelated changes in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL

* revert hip cmake changes

* fix indent

* add prefix in func name

* revert no mmq

* rm cpu blas duplicate

* fix no_new_line

* fix src1->type==F16 bug.

* pass batch offset for F16 src1

* fix batch error

* fix wrong code

* revert sycl checking in test-sampling

* pass void as argument of ggml_backend_sycl_print_sycl_devices

* remove extra blank line in test-sampling

* revert setting n_threads in sycl

* implement std::isinf for icpx with fast math.
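
  Context for this fix: under icpx with fast math the compiler may assume finite arithmetic and fold std::isinf() to a constant, so infinity has to be detected from the IEEE-754 bit pattern instead. A minimal sketch of such a workaround (the function name is illustrative):

  #include <cstdint>
  #include <cstring>

  static bool sycl_isinf(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits)); // type-pun safely via memcpy
      // infinity: exponent all ones, mantissa zero (sign bit masked off)
      return (bits & 0x7FFFFFFFu) == 0x7F800000u;
  }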

* Update ci/run.sh

Co-authored-by: Georgi Gerganov <[email protected]>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <[email protected]>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <[email protected]>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <[email protected]>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <[email protected]>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <[email protected]>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <[email protected]>

* add copyright and MIT license declaration

* update the cmd example

---------

Co-authored-by: jianyuzh <[email protected]>
Co-authored-by: luoyu-intel <[email protected]>
Co-authored-by: Meng, Hengyu <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (3)
  1. ggml-backend.c +5 -0
  2. ggml.c +20 -2
  3. ggml.h +1 -0
ggml-backend.c CHANGED

@@ -339,6 +339,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
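A small sketch of how the registration above is consumed, assuming the registry API present in this revision's ggml-backend.h (ggml_backend_reg_get_count() / ggml_backend_reg_get_name()); once the lazy registry init has run, the SYCL devices show up alongside CPU, CUDA, and Metal:

#include <stdio.h>
#include "ggml-backend.h"

void list_backends(void) {
    size_t n = ggml_backend_reg_get_count(); // triggers registry init on first call
    for (size_t i = 0; i < n; i++) {
        printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
}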
ggml.c CHANGED

@@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
 #endif
 
 // floating point type used to accumulate sums

@@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
+#elif defined(GGML_USE_SYCL)
+        ggml_init_sycl();
 #endif
 
     ggml_setup_op_has_task_pass();

@@ -14701,6 +14705,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
+#ifdef GGML_USE_SYCL
+    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+#endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
             {

@@ -20280,7 +20290,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;

@@ -20303,8 +20313,16 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
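The hunk at 14705 gives the SYCL backend the same first-refusal hook the CUBLAS path uses: the backend is offered every op before the CPU runs it. Sketched as a contract (the declaration matches the call site above; the comment describes the expected behavior):

// Offload hook called from ggml_compute_forward(): returns true if the op in
// `tensor` was executed on a SYCL device and the CPU implementation must be
// skipped; returns false to fall through to the normal CPU path.
bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);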
ggml.h CHANGED

@@ -2266,6 +2266,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_sycl       (void);
     GGML_API int ggml_cpu_has_vsx        (void);
 
     //
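As a usage note, the new flag plugs into the same runtime feature-reporting scheme as the other backends; a minimal caller might look like this (hypothetical example, not part of the commit):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // ggml_cpu_has_sycl() is 1 when ggml was built with GGML_USE_SYCL, else 0;
    // ggml_cpu_has_gpublas() now also reports 1 for SYCL builds.
    printf("SYCL = %d | GPU BLAS = %d\n", ggml_cpu_has_sycl(), ggml_cpu_has_gpublas());
    return 0;
}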